{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4001803697580039, "eval_steps": 500, "global_step": 6656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.04675119, "auxiliary_loss_mlp": 0.02099638, "balance_loss_clip": 1.77205122, "balance_loss_mlp": 2.27198935, "epoch": 6.012325266796934e-05, "flos": 24455432897280.0, "grad_norm": 54.633257353768954, "language_loss": 2.84989119, "learning_rate": 0.0, "loss": 1.94246852, "num_input_tokens_seen": 19155, "router_z_loss_clip": 3.28125, "router_z_loss_mlp": 24.0, "step": 1, "time_per_iteration": 19.307077646255493 }, { "auxiliary_loss_clip": 0.03143228, "auxiliary_loss_mlp": 0.01384828, "balance_loss_clip": 1.18112338, "balance_loss_mlp": 1.51281738, "epoch": 0.00012024650533593868, "flos": 20225010188160.0, "grad_norm": 36.066244838101376, "language_loss": 1.82575774, "learning_rate": 4e-06, "loss": 1.8710382, "num_input_tokens_seen": 36175, "router_z_loss_clip": 2.03125, "router_z_loss_mlp": 16.25, "step": 2, "time_per_iteration": 2.4628143310546875 }, { "auxiliary_loss_clip": 0.03080973, "auxiliary_loss_mlp": 0.0137341, "balance_loss_clip": 1.17828774, "balance_loss_mlp": 1.51664853, "epoch": 0.000180369758003908, "flos": 22308835996800.0, "grad_norm": 32.82768891459672, "language_loss": 1.57214069, "learning_rate": 3.999999964312572e-06, "loss": 1.61668456, "num_input_tokens_seen": 54870, "router_z_loss_clip": 1.953125, "router_z_loss_mlp": 15.625, "step": 3, "time_per_iteration": 2.409597635269165 }, { "auxiliary_loss_clip": 0.03109568, "auxiliary_loss_mlp": 0.01353198, "balance_loss_clip": 1.127177, "balance_loss_mlp": 1.51108968, "epoch": 0.00024049301067187735, "flos": 22413680409600.0, "grad_norm": 24.052977564940104, "language_loss": 1.37424958, "learning_rate": 3.99999985725029e-06, "loss": 1.41887736, "num_input_tokens_seen": 74575, "router_z_loss_clip": 2.265625, "router_z_loss_mlp": 16.0, "step": 4, "time_per_iteration": 2.4043149948120117 }, { "auxiliary_loss_clip": 0.03127305, "auxiliary_loss_mlp": 0.01403953, "balance_loss_clip": 1.18498981, "balance_loss_mlp": 1.50664127, "epoch": 0.0003006162633398467, "flos": 21395927099520.0, "grad_norm": 21.146805273774273, "language_loss": 1.415416, "learning_rate": 3.999999678813158e-06, "loss": 1.46072853, "num_input_tokens_seen": 92580, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 16.25, "step": 5, "time_per_iteration": 2.3708319664001465 }, { "auxiliary_loss_clip": 0.03063031, "auxiliary_loss_mlp": 0.01405226, "balance_loss_clip": 1.18578613, "balance_loss_mlp": 1.50378633, "epoch": 0.000360739516007816, "flos": 21651316761600.0, "grad_norm": 6.818791628039744, "language_loss": 1.17620254, "learning_rate": 3.999999429001183e-06, "loss": 1.22088504, "num_input_tokens_seen": 109705, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 15.5625, "step": 6, "time_per_iteration": 2.3820738792419434 }, { "auxiliary_loss_clip": 0.02998424, "auxiliary_loss_mlp": 0.01370393, "balance_loss_clip": 1.17241049, "balance_loss_mlp": 1.50680101, "epoch": 0.0004208627686757854, "flos": 27158586312960.0, "grad_norm": 4.94199637075652, "language_loss": 1.14681077, "learning_rate": 3.9999991078143714e-06, "loss": 1.19049883, "num_input_tokens_seen": 129425, "router_z_loss_clip": 1.984375, "router_z_loss_mlp": 14.9375, "step": 7, "time_per_iteration": 2.679643392562866 }, { "auxiliary_loss_clip": 0.02951132, "auxiliary_loss_mlp": 0.01311899, "balance_loss_clip": 1.13022375, "balance_loss_mlp": 1.49760175, "epoch": 0.0004809860213437547, "flos": 31317824292480.0, "grad_norm": 23.61535064956306, "language_loss": 0.9544208, "learning_rate": 3.999998715252736e-06, "loss": 0.99705112, "num_input_tokens_seen": 149210, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 14.5625, "step": 8, "time_per_iteration": 2.6806282997131348 }, { "auxiliary_loss_clip": 0.02969223, "auxiliary_loss_mlp": 0.01355113, "balance_loss_clip": 1.16456866, "balance_loss_mlp": 1.50150013, "epoch": 0.000541109274011724, "flos": 32159056435200.0, "grad_norm": 5.007732554313651, "language_loss": 1.11774778, "learning_rate": 3.999998251316293e-06, "loss": 1.16099107, "num_input_tokens_seen": 169055, "router_z_loss_clip": 1.90625, "router_z_loss_mlp": 14.6875, "step": 9, "time_per_iteration": 2.6619770526885986 }, { "auxiliary_loss_clip": 0.02872593, "auxiliary_loss_mlp": 0.01315594, "balance_loss_clip": 1.14421892, "balance_loss_mlp": 1.50026464, "epoch": 0.0006012325266796934, "flos": 18915801914880.0, "grad_norm": 3.139622664196079, "language_loss": 1.06644702, "learning_rate": 3.9999977160050555e-06, "loss": 1.10832882, "num_input_tokens_seen": 188045, "router_z_loss_clip": 1.7109375, "router_z_loss_mlp": 13.75, "step": 10, "time_per_iteration": 2.564631223678589 }, { "auxiliary_loss_clip": 0.02790104, "auxiliary_loss_mlp": 0.01297754, "balance_loss_clip": 1.13801312, "balance_loss_mlp": 1.4966042, "epoch": 0.0006613557793476627, "flos": 20773879672320.0, "grad_norm": 8.748217914557543, "language_loss": 1.10217166, "learning_rate": 3.9999971093190445e-06, "loss": 1.14305019, "num_input_tokens_seen": 207035, "router_z_loss_clip": 1.6015625, "router_z_loss_mlp": 12.9375, "step": 11, "time_per_iteration": 2.6023213863372803 }, { "auxiliary_loss_clip": 0.02668227, "auxiliary_loss_mlp": 0.01248991, "balance_loss_clip": 1.09602213, "balance_loss_mlp": 1.48630834, "epoch": 0.000721479032015632, "flos": 16580740896000.0, "grad_norm": 4.094164265162429, "language_loss": 1.09107757, "learning_rate": 3.999996431258282e-06, "loss": 1.13024974, "num_input_tokens_seen": 223225, "router_z_loss_clip": 1.53125, "router_z_loss_mlp": 11.875, "step": 12, "time_per_iteration": 2.528917074203491 }, { "auxiliary_loss_clip": 0.02649503, "auxiliary_loss_mlp": 0.0122052, "balance_loss_clip": 1.08004415, "balance_loss_mlp": 1.48940086, "epoch": 0.0007816022846836014, "flos": 23804340618240.0, "grad_norm": 3.2974397637514343, "language_loss": 0.99341649, "learning_rate": 3.999995681822791e-06, "loss": 1.03211677, "num_input_tokens_seen": 242570, "router_z_loss_clip": 1.40625, "router_z_loss_mlp": 11.625, "step": 13, "time_per_iteration": 2.6297576427459717 }, { "auxiliary_loss_clip": 0.02601865, "auxiliary_loss_mlp": 0.01248735, "balance_loss_clip": 1.10196495, "balance_loss_mlp": 1.48509979, "epoch": 0.0008417255373515708, "flos": 19171191576960.0, "grad_norm": 4.238777303539045, "language_loss": 1.06023884, "learning_rate": 3.999994861012598e-06, "loss": 1.09874487, "num_input_tokens_seen": 261215, "router_z_loss_clip": 1.46875, "router_z_loss_mlp": 11.125, "step": 14, "time_per_iteration": 2.5761733055114746 }, { "auxiliary_loss_clip": 0.02539715, "auxiliary_loss_mlp": 0.01209442, "balance_loss_clip": 1.07802558, "balance_loss_mlp": 1.48428929, "epoch": 0.00090184879001954, "flos": 26394372362880.0, "grad_norm": 2.5879242228297796, "language_loss": 0.9828856, "learning_rate": 3.999993968827733e-06, "loss": 1.02037716, "num_input_tokens_seen": 280035, "router_z_loss_clip": 1.3125, "router_z_loss_mlp": 10.5625, "step": 15, "time_per_iteration": 2.602414846420288 }, { "auxiliary_loss_clip": 0.02482357, "auxiliary_loss_mlp": 0.01201394, "balance_loss_clip": 1.07560468, "balance_loss_mlp": 1.47833943, "epoch": 0.0009619720426875094, "flos": 24678391305600.0, "grad_norm": 3.0687345466579785, "language_loss": 0.99246669, "learning_rate": 3.999993005268228e-06, "loss": 1.02930415, "num_input_tokens_seen": 300265, "router_z_loss_clip": 1.2578125, "router_z_loss_mlp": 10.0625, "step": 16, "time_per_iteration": 2.56613826751709 }, { "auxiliary_loss_clip": 0.02431807, "auxiliary_loss_mlp": 0.01216623, "balance_loss_clip": 1.10113311, "balance_loss_mlp": 1.47049284, "epoch": 0.0010220952953554788, "flos": 18623543990400.0, "grad_norm": 3.1215760991906873, "language_loss": 1.01441908, "learning_rate": 3.999991970334118e-06, "loss": 1.05090332, "num_input_tokens_seen": 317375, "router_z_loss_clip": 1.15625, "router_z_loss_mlp": 9.625, "step": 17, "time_per_iteration": 5.36155366897583 }, { "auxiliary_loss_clip": 0.02306633, "auxiliary_loss_mlp": 0.01184744, "balance_loss_clip": 1.07888603, "balance_loss_mlp": 1.46231151, "epoch": 0.001082218548023448, "flos": 26141286850560.0, "grad_norm": 2.330774005332857, "language_loss": 0.99674374, "learning_rate": 3.999990864025439e-06, "loss": 1.03165746, "num_input_tokens_seen": 337975, "router_z_loss_clip": 1.0625, "router_z_loss_mlp": 8.4375, "step": 18, "time_per_iteration": 2.5890016555786133 }, { "auxiliary_loss_clip": 0.02248247, "auxiliary_loss_mlp": 0.011974, "balance_loss_clip": 1.07666516, "balance_loss_mlp": 1.44878852, "epoch": 0.0011423418006914173, "flos": 19608758046720.0, "grad_norm": 3.298295705619953, "language_loss": 0.91169536, "learning_rate": 3.99998968634223e-06, "loss": 0.94615185, "num_input_tokens_seen": 356635, "router_z_loss_clip": 1.203125, "router_z_loss_mlp": 8.0, "step": 19, "time_per_iteration": 2.57875919342041 }, { "auxiliary_loss_clip": 0.02162659, "auxiliary_loss_mlp": 0.01173688, "balance_loss_clip": 1.08027506, "balance_loss_mlp": 1.44382668, "epoch": 0.0012024650533593868, "flos": 17894382912000.0, "grad_norm": 2.4149043095773175, "language_loss": 1.03630507, "learning_rate": 3.999988437284535e-06, "loss": 1.06966853, "num_input_tokens_seen": 375625, "router_z_loss_clip": 0.9375, "router_z_loss_mlp": 7.1875, "step": 20, "time_per_iteration": 2.589801788330078 }, { "auxiliary_loss_clip": 0.02108563, "auxiliary_loss_mlp": 0.01184161, "balance_loss_clip": 1.08779252, "balance_loss_mlp": 1.42771745, "epoch": 0.001262588306027356, "flos": 21250967667840.0, "grad_norm": 2.4908713227019383, "language_loss": 0.9450531, "learning_rate": 3.999987116852396e-06, "loss": 0.97798038, "num_input_tokens_seen": 394350, "router_z_loss_clip": 0.96484375, "router_z_loss_mlp": 6.8125, "step": 21, "time_per_iteration": 2.6463260650634766 }, { "auxiliary_loss_clip": 0.0205811, "auxiliary_loss_mlp": 0.01174418, "balance_loss_clip": 1.08300817, "balance_loss_mlp": 1.41683114, "epoch": 0.0013227115586953253, "flos": 26102882488320.0, "grad_norm": 2.5239968437472995, "language_loss": 0.96045399, "learning_rate": 3.999985725045861e-06, "loss": 0.99277925, "num_input_tokens_seen": 413255, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 6.40625, "step": 22, "time_per_iteration": 2.61918306350708 }, { "auxiliary_loss_clip": 0.020383, "auxiliary_loss_mlp": 0.01197761, "balance_loss_clip": 1.10830641, "balance_loss_mlp": 1.41866231, "epoch": 0.0013828348113632948, "flos": 23950242656640.0, "grad_norm": 1.8613463886433936, "language_loss": 0.83544517, "learning_rate": 3.999984261864982e-06, "loss": 0.86780572, "num_input_tokens_seen": 433065, "router_z_loss_clip": 0.89453125, "router_z_loss_mlp": 6.1875, "step": 23, "time_per_iteration": 2.5596020221710205 }, { "auxiliary_loss_clip": 0.02009691, "auxiliary_loss_mlp": 0.01183879, "balance_loss_clip": 1.09533048, "balance_loss_mlp": 1.41109812, "epoch": 0.001442958064031264, "flos": 15958972759680.0, "grad_norm": 2.070711216257559, "language_loss": 1.0156163, "learning_rate": 3.999982727309807e-06, "loss": 1.04755211, "num_input_tokens_seen": 451175, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 6.0, "step": 24, "time_per_iteration": 2.5257441997528076 }, { "auxiliary_loss_clip": 0.01941478, "auxiliary_loss_mlp": 0.01207248, "balance_loss_clip": 1.11855638, "balance_loss_mlp": 1.39769328, "epoch": 0.0015030813166992333, "flos": 18107527962240.0, "grad_norm": 3.830467856535708, "language_loss": 0.93178821, "learning_rate": 3.999981121380394e-06, "loss": 0.96327549, "num_input_tokens_seen": 468775, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 5.4375, "step": 25, "time_per_iteration": 2.5001440048217773 }, { "auxiliary_loss_clip": 0.01920455, "auxiliary_loss_mlp": 0.01198773, "balance_loss_clip": 1.10831678, "balance_loss_mlp": 1.39311361, "epoch": 0.0015632045693672028, "flos": 22233528460800.0, "grad_norm": 2.107854477846936, "language_loss": 1.00563216, "learning_rate": 3.9999794440768e-06, "loss": 1.03682446, "num_input_tokens_seen": 488530, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 5.28125, "step": 26, "time_per_iteration": 2.573878526687622 }, { "auxiliary_loss_clip": 0.01911834, "auxiliary_loss_mlp": 0.01198717, "balance_loss_clip": 1.11126542, "balance_loss_mlp": 1.38922739, "epoch": 0.001623327822035172, "flos": 23990706789120.0, "grad_norm": 2.122014738041503, "language_loss": 0.89966214, "learning_rate": 3.999977695399084e-06, "loss": 0.93076766, "num_input_tokens_seen": 510495, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 5.21875, "step": 27, "time_per_iteration": 2.562899351119995 }, { "auxiliary_loss_clip": 0.01883569, "auxiliary_loss_mlp": 0.01206261, "balance_loss_clip": 1.12271953, "balance_loss_mlp": 1.37992167, "epoch": 0.0016834510747031415, "flos": 19676769108480.0, "grad_norm": 2.0938647514651083, "language_loss": 0.99451423, "learning_rate": 3.999975875347308e-06, "loss": 1.02541244, "num_input_tokens_seen": 528605, "router_z_loss_clip": 0.8359375, "router_z_loss_mlp": 5.03125, "step": 28, "time_per_iteration": 2.5451371669769287 }, { "auxiliary_loss_clip": 0.01877581, "auxiliary_loss_mlp": 0.01182278, "balance_loss_clip": 1.09611368, "balance_loss_mlp": 1.37079179, "epoch": 0.0017435743273711108, "flos": 20922749176320.0, "grad_norm": 2.1468105105526787, "language_loss": 0.96911222, "learning_rate": 3.999973983921538e-06, "loss": 0.99971074, "num_input_tokens_seen": 548515, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 5.0625, "step": 29, "time_per_iteration": 2.53570818901062 }, { "auxiliary_loss_clip": 0.01867424, "auxiliary_loss_mlp": 0.01180375, "balance_loss_clip": 1.09459209, "balance_loss_mlp": 1.36146772, "epoch": 0.00180369758003908, "flos": 19528178895360.0, "grad_norm": 3.1842010867040664, "language_loss": 1.11302865, "learning_rate": 3.9999720211218405e-06, "loss": 1.14350665, "num_input_tokens_seen": 564025, "router_z_loss_clip": 0.859375, "router_z_loss_mlp": 5.0625, "step": 30, "time_per_iteration": 2.5348143577575684 }, { "auxiliary_loss_clip": 0.0182161, "auxiliary_loss_mlp": 0.01175429, "balance_loss_clip": 1.08797705, "balance_loss_mlp": 1.35064387, "epoch": 0.0018638208327070496, "flos": 27451961400960.0, "grad_norm": 2.3505293065017008, "language_loss": 0.96483362, "learning_rate": 3.999969986948286e-06, "loss": 0.99480397, "num_input_tokens_seen": 583345, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 4.71875, "step": 31, "time_per_iteration": 2.5954768657684326 }, { "auxiliary_loss_clip": 0.01802228, "auxiliary_loss_mlp": 0.01171508, "balance_loss_clip": 1.08448565, "balance_loss_mlp": 1.33967805, "epoch": 0.0019239440853750188, "flos": 13588614489600.0, "grad_norm": 2.0416038758443933, "language_loss": 0.88528389, "learning_rate": 3.999967881400949e-06, "loss": 0.91502124, "num_input_tokens_seen": 600010, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 4.625, "step": 32, "time_per_iteration": 2.538137435913086 }, { "auxiliary_loss_clip": 0.01803808, "auxiliary_loss_mlp": 0.0116373, "balance_loss_clip": 1.07575357, "balance_loss_mlp": 1.33184898, "epoch": 0.001984067338042988, "flos": 11253099623040.0, "grad_norm": 2.7298373076256124, "language_loss": 0.87022352, "learning_rate": 3.999965704479901e-06, "loss": 0.89989889, "num_input_tokens_seen": 616295, "router_z_loss_clip": 0.87890625, "router_z_loss_mlp": 4.71875, "step": 33, "time_per_iteration": 2.531190872192383 }, { "auxiliary_loss_clip": 0.01769014, "auxiliary_loss_mlp": 0.01165698, "balance_loss_clip": 1.07939088, "balance_loss_mlp": 1.32378912, "epoch": 0.0020441905907109576, "flos": 22385051228160.0, "grad_norm": 2.060972244943001, "language_loss": 0.86651742, "learning_rate": 3.999963456185222e-06, "loss": 0.89586449, "num_input_tokens_seen": 637640, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 4.4375, "step": 34, "time_per_iteration": 2.5855860710144043 }, { "auxiliary_loss_clip": 0.01742666, "auxiliary_loss_mlp": 0.0113303, "balance_loss_clip": 1.04314601, "balance_loss_mlp": 1.30449271, "epoch": 0.0021043138433789266, "flos": 49776858489600.0, "grad_norm": 2.195255265685214, "language_loss": 0.70702922, "learning_rate": 3.999961136516991e-06, "loss": 0.7357862, "num_input_tokens_seen": 659710, "router_z_loss_clip": 0.8984375, "router_z_loss_mlp": 4.375, "step": 35, "time_per_iteration": 2.8236277103424072 }, { "auxiliary_loss_clip": 0.0174311, "auxiliary_loss_mlp": 0.01142913, "balance_loss_clip": 1.05574691, "balance_loss_mlp": 1.3051616, "epoch": 0.002164437096046896, "flos": 20556929283840.0, "grad_norm": 2.0848739687058857, "language_loss": 0.8459003, "learning_rate": 3.999958745475293e-06, "loss": 0.87476051, "num_input_tokens_seen": 679670, "router_z_loss_clip": 0.87109375, "router_z_loss_mlp": 4.375, "step": 36, "time_per_iteration": 2.5762763023376465 }, { "auxiliary_loss_clip": 0.01730108, "auxiliary_loss_mlp": 0.01154336, "balance_loss_clip": 1.06540632, "balance_loss_mlp": 1.29269588, "epoch": 0.0022245603487148656, "flos": 26541077362560.0, "grad_norm": 3.013775673998062, "language_loss": 0.87594348, "learning_rate": 3.999956283060211e-06, "loss": 0.9047879, "num_input_tokens_seen": 700170, "router_z_loss_clip": 0.88671875, "router_z_loss_mlp": 4.375, "step": 37, "time_per_iteration": 2.625054359436035 }, { "auxiliary_loss_clip": 0.01715788, "auxiliary_loss_mlp": 0.01164623, "balance_loss_clip": 1.07306981, "balance_loss_mlp": 1.29187417, "epoch": 0.0022846836013828346, "flos": 20337185986560.0, "grad_norm": 1.8630027244681364, "language_loss": 0.99655676, "learning_rate": 3.9999537492718345e-06, "loss": 1.02536082, "num_input_tokens_seen": 718545, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 4.25, "step": 38, "time_per_iteration": 2.559971332550049 }, { "auxiliary_loss_clip": 0.01694284, "auxiliary_loss_mlp": 0.01137003, "balance_loss_clip": 1.04478264, "balance_loss_mlp": 1.28287303, "epoch": 0.002344806854050804, "flos": 26246445465600.0, "grad_norm": 2.21150037379473, "language_loss": 0.81611729, "learning_rate": 3.999951144110252e-06, "loss": 0.84443015, "num_input_tokens_seen": 739865, "router_z_loss_clip": 0.921875, "router_z_loss_mlp": 4.125, "step": 39, "time_per_iteration": 2.6174371242523193 }, { "auxiliary_loss_clip": 0.01697233, "auxiliary_loss_mlp": 0.01149935, "balance_loss_clip": 1.05900264, "balance_loss_mlp": 1.27529538, "epoch": 0.0024049301067187736, "flos": 11800747209600.0, "grad_norm": 2.8311092766346047, "language_loss": 0.83641642, "learning_rate": 3.999948467575558e-06, "loss": 0.86488813, "num_input_tokens_seen": 755770, "router_z_loss_clip": 0.90625, "router_z_loss_mlp": 4.21875, "step": 40, "time_per_iteration": 2.546539783477783 }, { "auxiliary_loss_clip": 0.01683985, "auxiliary_loss_mlp": 0.01153351, "balance_loss_clip": 1.06179821, "balance_loss_mlp": 1.2719717, "epoch": 0.0024650533593867426, "flos": 20630456340480.0, "grad_norm": 2.7020804722368825, "language_loss": 0.88915122, "learning_rate": 3.999945719667849e-06, "loss": 0.91752458, "num_input_tokens_seen": 773440, "router_z_loss_clip": 0.9140625, "router_z_loss_mlp": 4.125, "step": 41, "time_per_iteration": 2.5499825477600098 }, { "auxiliary_loss_clip": 0.01660993, "auxiliary_loss_mlp": 0.01141511, "balance_loss_clip": 1.05691957, "balance_loss_mlp": 1.26325691, "epoch": 0.002525176612054712, "flos": 18405127324800.0, "grad_norm": 2.0646008580873874, "language_loss": 0.92708147, "learning_rate": 3.999942900387221e-06, "loss": 0.9551065, "num_input_tokens_seen": 790455, "router_z_loss_clip": 0.84765625, "router_z_loss_mlp": 3.96875, "step": 42, "time_per_iteration": 2.537623167037964 }, { "auxiliary_loss_clip": 0.01651451, "auxiliary_loss_mlp": 0.01167617, "balance_loss_clip": 1.07568288, "balance_loss_mlp": 1.2582351, "epoch": 0.0025852998647226816, "flos": 28182763313280.0, "grad_norm": 5.185354719665505, "language_loss": 0.93673301, "learning_rate": 3.999940009733775e-06, "loss": 0.96492368, "num_input_tokens_seen": 810645, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 3.9375, "step": 43, "time_per_iteration": 2.595491647720337 }, { "auxiliary_loss_clip": 0.016468, "auxiliary_loss_mlp": 0.01148236, "balance_loss_clip": 1.06078386, "balance_loss_mlp": 1.25160539, "epoch": 0.0026454231173906506, "flos": 14282233937280.0, "grad_norm": 5.263731246682537, "language_loss": 0.88909531, "learning_rate": 3.9999370477076146e-06, "loss": 0.91704565, "num_input_tokens_seen": 827470, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 3.953125, "step": 44, "time_per_iteration": 2.51104736328125 }, { "auxiliary_loss_clip": 0.01636214, "auxiliary_loss_mlp": 0.01135016, "balance_loss_clip": 1.05443072, "balance_loss_mlp": 1.24934185, "epoch": 0.00270554637005862, "flos": 22418114152320.0, "grad_norm": 2.629764911890971, "language_loss": 0.94870114, "learning_rate": 3.9999340143088455e-06, "loss": 0.97641337, "num_input_tokens_seen": 847285, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 3.875, "step": 45, "time_per_iteration": 2.719869613647461 }, { "auxiliary_loss_clip": 0.01637104, "auxiliary_loss_mlp": 0.01135423, "balance_loss_clip": 1.05626822, "balance_loss_mlp": 1.24332213, "epoch": 0.0027656696227265896, "flos": 23984702035200.0, "grad_norm": 1.653387096013355, "language_loss": 0.99931061, "learning_rate": 3.999930909537576e-06, "loss": 1.02703583, "num_input_tokens_seen": 867545, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 3.9375, "step": 46, "time_per_iteration": 2.7189016342163086 }, { "auxiliary_loss_clip": 0.01620508, "auxiliary_loss_mlp": 0.01147488, "balance_loss_clip": 1.06122851, "balance_loss_mlp": 1.23951733, "epoch": 0.0028257928753945586, "flos": 37668001731840.0, "grad_norm": 2.1541038360868874, "language_loss": 0.84128428, "learning_rate": 3.999927733393916e-06, "loss": 0.86896425, "num_input_tokens_seen": 889915, "router_z_loss_clip": 0.86328125, "router_z_loss_mlp": 3.8125, "step": 47, "time_per_iteration": 2.7456581592559814 }, { "auxiliary_loss_clip": 0.01600178, "auxiliary_loss_mlp": 0.01142669, "balance_loss_clip": 1.05531228, "balance_loss_mlp": 1.23255134, "epoch": 0.002885916128062528, "flos": 22453481226240.0, "grad_norm": 1.773987463822889, "language_loss": 0.85155529, "learning_rate": 3.99992448587798e-06, "loss": 0.87898374, "num_input_tokens_seen": 908975, "router_z_loss_clip": 0.875, "router_z_loss_mlp": 3.671875, "step": 48, "time_per_iteration": 2.7141005992889404 }, { "auxiliary_loss_clip": 0.01593724, "auxiliary_loss_mlp": 0.01130042, "balance_loss_clip": 1.04640508, "balance_loss_mlp": 1.22513652, "epoch": 0.0029460393807304976, "flos": 27011671845120.0, "grad_norm": 3.2920442272937573, "language_loss": 0.8657636, "learning_rate": 3.999921166989884e-06, "loss": 0.89300132, "num_input_tokens_seen": 929810, "router_z_loss_clip": 0.8359375, "router_z_loss_mlp": 3.6875, "step": 49, "time_per_iteration": 2.6943843364715576 }, { "auxiliary_loss_clip": 0.01589589, "auxiliary_loss_mlp": 0.01155181, "balance_loss_clip": 1.07488132, "balance_loss_mlp": 1.2209065, "epoch": 0.0030061626333984666, "flos": 15850916501760.0, "grad_norm": 2.2248531444274304, "language_loss": 0.88049072, "learning_rate": 3.999917776729746e-06, "loss": 0.90793836, "num_input_tokens_seen": 948650, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 3.6875, "step": 50, "time_per_iteration": 2.5544683933258057 }, { "auxiliary_loss_clip": 0.01584487, "auxiliary_loss_mlp": 0.01129517, "balance_loss_clip": 1.05141068, "balance_loss_mlp": 1.21761751, "epoch": 0.003066285886066436, "flos": 31825845619200.0, "grad_norm": 4.248983325462892, "language_loss": 0.83911979, "learning_rate": 3.999914315097687e-06, "loss": 0.86625981, "num_input_tokens_seen": 966455, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 3.671875, "step": 51, "time_per_iteration": 2.647359609603882 }, { "auxiliary_loss_clip": 0.01567797, "auxiliary_loss_mlp": 0.01153382, "balance_loss_clip": 1.07098413, "balance_loss_mlp": 1.21582484, "epoch": 0.0031264091387344056, "flos": 41425878188160.0, "grad_norm": 1.8429176982897937, "language_loss": 0.91580838, "learning_rate": 3.999910782093829e-06, "loss": 0.94302016, "num_input_tokens_seen": 988110, "router_z_loss_clip": 0.82421875, "router_z_loss_mlp": 3.53125, "step": 52, "time_per_iteration": 2.7222397327423096 }, { "auxiliary_loss_clip": 0.01572168, "auxiliary_loss_mlp": 0.01145655, "balance_loss_clip": 1.06015766, "balance_loss_mlp": 1.21593559, "epoch": 0.0031865323914023747, "flos": 23439812446080.0, "grad_norm": 2.1818325670174197, "language_loss": 0.88794315, "learning_rate": 3.999907177718301e-06, "loss": 0.91512132, "num_input_tokens_seen": 1008550, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 3.5625, "step": 53, "time_per_iteration": 2.5715830326080322 }, { "auxiliary_loss_clip": 0.01566318, "auxiliary_loss_mlp": 0.01159099, "balance_loss_clip": 1.07393575, "balance_loss_mlp": 1.21152437, "epoch": 0.003246655644070344, "flos": 14428310532480.0, "grad_norm": 2.336043391261303, "language_loss": 0.79594576, "learning_rate": 3.99990350197123e-06, "loss": 0.82319993, "num_input_tokens_seen": 1026840, "router_z_loss_clip": 0.8515625, "router_z_loss_mlp": 3.546875, "step": 54, "time_per_iteration": 2.5533878803253174 }, { "auxiliary_loss_clip": 0.01562585, "auxiliary_loss_mlp": 0.01153723, "balance_loss_clip": 1.06917977, "balance_loss_mlp": 1.20680356, "epoch": 0.0033067788967383136, "flos": 35916793246080.0, "grad_norm": 2.6399092445371815, "language_loss": 0.77894688, "learning_rate": 3.999899754852747e-06, "loss": 0.80610991, "num_input_tokens_seen": 1048875, "router_z_loss_clip": 0.84765625, "router_z_loss_mlp": 3.5625, "step": 55, "time_per_iteration": 2.686530828475952 }, { "auxiliary_loss_clip": 0.01560018, "auxiliary_loss_mlp": 0.01133782, "balance_loss_clip": 1.05348277, "balance_loss_mlp": 1.20421016, "epoch": 0.003366902149406283, "flos": 22957836860160.0, "grad_norm": 3.5583647739648487, "language_loss": 0.83599192, "learning_rate": 3.999895936362987e-06, "loss": 0.86292994, "num_input_tokens_seen": 1066435, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 3.5625, "step": 56, "time_per_iteration": 4.309218645095825 }, { "auxiliary_loss_clip": 0.01562721, "auxiliary_loss_mlp": 0.01154903, "balance_loss_clip": 1.07555699, "balance_loss_mlp": 1.19991803, "epoch": 0.003427025402074252, "flos": 26581506583680.0, "grad_norm": 1.804282673250652, "language_loss": 0.90663362, "learning_rate": 3.9998920465020845e-06, "loss": 0.93380976, "num_input_tokens_seen": 1090330, "router_z_loss_clip": 0.79296875, "router_z_loss_mlp": 3.625, "step": 57, "time_per_iteration": 4.142320156097412 }, { "auxiliary_loss_clip": 0.01542008, "auxiliary_loss_mlp": 0.01138251, "balance_loss_clip": 1.05480409, "balance_loss_mlp": 1.20625949, "epoch": 0.0034871486547422216, "flos": 23950068099840.0, "grad_norm": 2.146597421329481, "language_loss": 0.9681412, "learning_rate": 3.999888085270179e-06, "loss": 0.9949438, "num_input_tokens_seen": 1109840, "router_z_loss_clip": 0.8359375, "router_z_loss_mlp": 3.34375, "step": 58, "time_per_iteration": 2.59883451461792 }, { "auxiliary_loss_clip": 0.01538591, "auxiliary_loss_mlp": 0.01136056, "balance_loss_clip": 1.0576638, "balance_loss_mlp": 1.20107901, "epoch": 0.003547271907410191, "flos": 21213924848640.0, "grad_norm": 2.2963116984050234, "language_loss": 0.8566339, "learning_rate": 3.9998840526674135e-06, "loss": 0.88338029, "num_input_tokens_seen": 1128415, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 3.375, "step": 59, "time_per_iteration": 2.588517189025879 }, { "auxiliary_loss_clip": 0.01543832, "auxiliary_loss_mlp": 0.01121869, "balance_loss_clip": 1.04376316, "balance_loss_mlp": 1.19997227, "epoch": 0.00360739516007816, "flos": 16504071816960.0, "grad_norm": 3.6106206237954153, "language_loss": 0.90589786, "learning_rate": 3.999879948693929e-06, "loss": 0.9325549, "num_input_tokens_seen": 1146515, "router_z_loss_clip": 0.78125, "router_z_loss_mlp": 3.4375, "step": 60, "time_per_iteration": 2.5708141326904297 }, { "auxiliary_loss_clip": 0.01536828, "auxiliary_loss_mlp": 0.01128284, "balance_loss_clip": 1.05480361, "balance_loss_mlp": 1.19439209, "epoch": 0.0036675184127461296, "flos": 19463763703680.0, "grad_norm": 3.0867057656717347, "language_loss": 0.86699647, "learning_rate": 3.999875773349874e-06, "loss": 0.89364761, "num_input_tokens_seen": 1166330, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 3.421875, "step": 61, "time_per_iteration": 2.733415365219116 }, { "auxiliary_loss_clip": 0.01530636, "auxiliary_loss_mlp": 0.01119644, "balance_loss_clip": 1.0487386, "balance_loss_mlp": 1.19737685, "epoch": 0.003727641665414099, "flos": 20956335770880.0, "grad_norm": 1.9044505707105237, "language_loss": 0.86141676, "learning_rate": 3.999871526635397e-06, "loss": 0.88791955, "num_input_tokens_seen": 1186010, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 3.328125, "step": 62, "time_per_iteration": 2.644710063934326 }, { "auxiliary_loss_clip": 0.01525647, "auxiliary_loss_mlp": 0.01135761, "balance_loss_clip": 1.06042099, "balance_loss_mlp": 1.19628096, "epoch": 0.003787764918082068, "flos": 18405057502080.0, "grad_norm": 1.984669322035565, "language_loss": 0.94111091, "learning_rate": 3.999867208550649e-06, "loss": 0.96772498, "num_input_tokens_seen": 1204985, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 3.296875, "step": 63, "time_per_iteration": 2.6939220428466797 }, { "auxiliary_loss_clip": 0.01522687, "auxiliary_loss_mlp": 0.01124747, "balance_loss_clip": 1.05078948, "balance_loss_mlp": 1.19448018, "epoch": 0.0038478881707500376, "flos": 12458406090240.0, "grad_norm": 6.071362458483904, "language_loss": 0.95735359, "learning_rate": 3.999862819095785e-06, "loss": 0.98382795, "num_input_tokens_seen": 1223545, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 3.28125, "step": 64, "time_per_iteration": 2.6445887088775635 }, { "auxiliary_loss_clip": 0.01532445, "auxiliary_loss_mlp": 0.01135816, "balance_loss_clip": 1.06204915, "balance_loss_mlp": 1.19461608, "epoch": 0.003908011423418007, "flos": 13552479365760.0, "grad_norm": 1.8832393227624737, "language_loss": 0.82848072, "learning_rate": 3.999858358270962e-06, "loss": 0.85516334, "num_input_tokens_seen": 1241175, "router_z_loss_clip": 0.73828125, "router_z_loss_mlp": 3.375, "step": 65, "time_per_iteration": 2.6585493087768555 }, { "auxiliary_loss_clip": 0.01523418, "auxiliary_loss_mlp": 0.01124821, "balance_loss_clip": 1.05129278, "balance_loss_mlp": 1.19321489, "epoch": 0.003968134676085976, "flos": 18332473052160.0, "grad_norm": 1.8421961963241207, "language_loss": 0.83222592, "learning_rate": 3.999853826076338e-06, "loss": 0.85870826, "num_input_tokens_seen": 1259315, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 3.296875, "step": 66, "time_per_iteration": 2.7048535346984863 }, { "auxiliary_loss_clip": 0.01520742, "auxiliary_loss_mlp": 0.01120062, "balance_loss_clip": 1.03871369, "balance_loss_mlp": 1.18489003, "epoch": 0.004028257928753946, "flos": 20484205188480.0, "grad_norm": 2.219008712215282, "language_loss": 0.94136697, "learning_rate": 3.999849222512075e-06, "loss": 0.96777511, "num_input_tokens_seen": 1277055, "router_z_loss_clip": 0.8125, "router_z_loss_mlp": 3.359375, "step": 67, "time_per_iteration": 2.617933511734009 }, { "auxiliary_loss_clip": 0.01507726, "auxiliary_loss_mlp": 0.01130344, "balance_loss_clip": 1.04932952, "balance_loss_mlp": 1.18441331, "epoch": 0.004088381181421915, "flos": 18842833440000.0, "grad_norm": 2.354307623528934, "language_loss": 0.92092949, "learning_rate": 3.9998445475783365e-06, "loss": 0.94731021, "num_input_tokens_seen": 1294355, "router_z_loss_clip": 0.80859375, "router_z_loss_mlp": 3.234375, "step": 68, "time_per_iteration": 2.577284097671509 }, { "auxiliary_loss_clip": 0.01497895, "auxiliary_loss_mlp": 0.01131936, "balance_loss_clip": 1.0565958, "balance_loss_mlp": 1.18115139, "epoch": 0.004148504434089885, "flos": 19426790707200.0, "grad_norm": 2.9681617808792784, "language_loss": 0.9445743, "learning_rate": 3.999839801275292e-06, "loss": 0.97087264, "num_input_tokens_seen": 1313525, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 3.171875, "step": 69, "time_per_iteration": 2.5266003608703613 }, { "auxiliary_loss_clip": 0.01495954, "auxiliary_loss_mlp": 0.01127572, "balance_loss_clip": 1.05685711, "balance_loss_mlp": 1.18270397, "epoch": 0.004208627686757853, "flos": 20810049707520.0, "grad_norm": 2.724199016749798, "language_loss": 0.96660733, "learning_rate": 3.999834983603108e-06, "loss": 0.99284261, "num_input_tokens_seen": 1330505, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 3.125, "step": 70, "time_per_iteration": 2.590308666229248 }, { "auxiliary_loss_clip": 0.01504126, "auxiliary_loss_mlp": 0.01123484, "balance_loss_clip": 1.04795337, "balance_loss_mlp": 1.17364883, "epoch": 0.004268750939425823, "flos": 19097629608960.0, "grad_norm": 6.624474777525524, "language_loss": 0.91867542, "learning_rate": 3.9998300945619576e-06, "loss": 0.94495147, "num_input_tokens_seen": 1349615, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 3.3125, "step": 71, "time_per_iteration": 2.5628535747528076 }, { "auxiliary_loss_clip": 0.01797056, "auxiliary_loss_mlp": 0.01431044, "balance_loss_clip": 1.39118016, "balance_loss_mlp": 1.42536378, "epoch": 0.004328874192093792, "flos": 52436889377280.0, "grad_norm": 2.2573204514841265, "language_loss": 0.65688801, "learning_rate": 3.999825134152016e-06, "loss": 0.68916899, "num_input_tokens_seen": 1410275, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 3.71875, "step": 72, "time_per_iteration": 3.1397647857666016 }, { "auxiliary_loss_clip": 0.01734601, "auxiliary_loss_mlp": 0.01323334, "balance_loss_clip": 1.28423309, "balance_loss_mlp": 1.40884876, "epoch": 0.004388997444761762, "flos": 66469459115520.0, "grad_norm": 2.081010780673298, "language_loss": 0.63639885, "learning_rate": 3.999820102373459e-06, "loss": 0.66697824, "num_input_tokens_seen": 1473020, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 3.25, "step": 73, "time_per_iteration": 3.1842033863067627 }, { "auxiliary_loss_clip": 0.01489288, "auxiliary_loss_mlp": 0.01117901, "balance_loss_clip": 1.04642332, "balance_loss_mlp": 1.17090213, "epoch": 0.004449120697429731, "flos": 18951971950080.0, "grad_norm": 3.399614259340687, "language_loss": 0.83614337, "learning_rate": 3.999814999226467e-06, "loss": 0.86221528, "num_input_tokens_seen": 1490385, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 3.1875, "step": 74, "time_per_iteration": 2.5562846660614014 }, { "auxiliary_loss_clip": 0.0150131, "auxiliary_loss_mlp": 0.01129769, "balance_loss_clip": 1.05662227, "balance_loss_mlp": 1.17718089, "epoch": 0.004509243950097701, "flos": 21104437224960.0, "grad_norm": 2.068454009829982, "language_loss": 0.94926447, "learning_rate": 3.999809824711222e-06, "loss": 0.97557527, "num_input_tokens_seen": 1509725, "router_z_loss_clip": 0.73046875, "router_z_loss_mlp": 3.25, "step": 75, "time_per_iteration": 2.530402421951294 }, { "auxiliary_loss_clip": 0.01481729, "auxiliary_loss_mlp": 0.01138677, "balance_loss_clip": 1.06629276, "balance_loss_mlp": 1.17169082, "epoch": 0.004569367202765669, "flos": 20697838997760.0, "grad_norm": 2.4761529519656147, "language_loss": 0.86071384, "learning_rate": 3.9998045788279075e-06, "loss": 0.88691783, "num_input_tokens_seen": 1527245, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 3.09375, "step": 76, "time_per_iteration": 2.5472307205200195 }, { "auxiliary_loss_clip": 0.01492312, "auxiliary_loss_mlp": 0.01127005, "balance_loss_clip": 1.05362022, "balance_loss_mlp": 1.17710352, "epoch": 0.004629490455433639, "flos": 28657198045440.0, "grad_norm": 1.769193740793799, "language_loss": 0.90393454, "learning_rate": 3.9997992615767125e-06, "loss": 0.93012774, "num_input_tokens_seen": 1548930, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 3.15625, "step": 77, "time_per_iteration": 2.5652127265930176 }, { "auxiliary_loss_clip": 0.01483891, "auxiliary_loss_mlp": 0.01125642, "balance_loss_clip": 1.05130339, "balance_loss_mlp": 1.17786443, "epoch": 0.004689613708101608, "flos": 11071621042560.0, "grad_norm": 3.8040509470686623, "language_loss": 0.8998087, "learning_rate": 3.9997938729578266e-06, "loss": 0.92590404, "num_input_tokens_seen": 1565695, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 3.0625, "step": 78, "time_per_iteration": 2.5184528827667236 }, { "auxiliary_loss_clip": 0.01486718, "auxiliary_loss_mlp": 0.01138278, "balance_loss_clip": 1.06508303, "balance_loss_mlp": 1.17505836, "epoch": 0.004749736960769578, "flos": 21798021761280.0, "grad_norm": 7.844066050884662, "language_loss": 0.80664903, "learning_rate": 3.99978841297144e-06, "loss": 0.83289897, "num_input_tokens_seen": 1582625, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 3.125, "step": 79, "time_per_iteration": 2.558370351791382 }, { "auxiliary_loss_clip": 0.01497252, "auxiliary_loss_mlp": 0.01123498, "balance_loss_clip": 1.05368876, "balance_loss_mlp": 1.18235707, "epoch": 0.004809860213437547, "flos": 19791563258880.0, "grad_norm": 3.243026160233554, "language_loss": 0.90004849, "learning_rate": 3.99978288161775e-06, "loss": 0.92625594, "num_input_tokens_seen": 1601725, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 3.15625, "step": 80, "time_per_iteration": 2.5560710430145264 }, { "auxiliary_loss_clip": 0.01486012, "auxiliary_loss_mlp": 0.01134552, "balance_loss_clip": 1.06226373, "balance_loss_mlp": 1.18250656, "epoch": 0.004869983466105517, "flos": 26573232591360.0, "grad_norm": 2.165405405963283, "language_loss": 0.93016237, "learning_rate": 3.999777278896952e-06, "loss": 0.95636809, "num_input_tokens_seen": 1622420, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 3.03125, "step": 81, "time_per_iteration": 2.637070894241333 }, { "auxiliary_loss_clip": 0.01490725, "auxiliary_loss_mlp": 0.01150368, "balance_loss_clip": 1.07927203, "balance_loss_mlp": 1.18020296, "epoch": 0.004930106718773485, "flos": 12822550237440.0, "grad_norm": 2.6227018187074114, "language_loss": 0.94184101, "learning_rate": 3.999771604809249e-06, "loss": 0.96825194, "num_input_tokens_seen": 1640715, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 3.09375, "step": 82, "time_per_iteration": 2.5250260829925537 }, { "auxiliary_loss_clip": 0.01483064, "auxiliary_loss_mlp": 0.01149186, "balance_loss_clip": 1.07451355, "balance_loss_mlp": 1.17058671, "epoch": 0.004990229971441455, "flos": 25773756301440.0, "grad_norm": 2.157109061404475, "language_loss": 0.85192108, "learning_rate": 3.999765859354839e-06, "loss": 0.87824357, "num_input_tokens_seen": 1662210, "router_z_loss_clip": 0.74609375, "router_z_loss_mlp": 3.125, "step": 83, "time_per_iteration": 2.627007484436035 }, { "auxiliary_loss_clip": 0.0147668, "auxiliary_loss_mlp": 0.01154582, "balance_loss_clip": 1.07733428, "balance_loss_mlp": 1.16956425, "epoch": 0.005050353224109424, "flos": 17456292771840.0, "grad_norm": 2.7224666668920112, "language_loss": 0.90668142, "learning_rate": 3.999760042533931e-06, "loss": 0.93299407, "num_input_tokens_seen": 1681070, "router_z_loss_clip": 0.7734375, "router_z_loss_mlp": 3.078125, "step": 84, "time_per_iteration": 2.4960057735443115 }, { "auxiliary_loss_clip": 0.01597049, "auxiliary_loss_mlp": 0.01233618, "balance_loss_clip": 1.19604373, "balance_loss_mlp": 1.38286448, "epoch": 0.005110476476777394, "flos": 69802269235200.0, "grad_norm": 1.0747967940372232, "language_loss": 0.61884308, "learning_rate": 3.999754154346731e-06, "loss": 0.64714968, "num_input_tokens_seen": 1747140, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.140625, "step": 85, "time_per_iteration": 3.219843626022339 }, { "auxiliary_loss_clip": 0.01462354, "auxiliary_loss_mlp": 0.011176, "balance_loss_clip": 1.04264069, "balance_loss_mlp": 1.15787935, "epoch": 0.005170599729445363, "flos": 24788961181440.0, "grad_norm": 2.109722333055802, "language_loss": 0.89139509, "learning_rate": 3.999748194793449e-06, "loss": 0.9171946, "num_input_tokens_seen": 1767475, "router_z_loss_clip": 0.75, "router_z_loss_mlp": 3.046875, "step": 86, "time_per_iteration": 2.5646045207977295 }, { "auxiliary_loss_clip": 0.01472103, "auxiliary_loss_mlp": 0.01136714, "balance_loss_clip": 1.06227946, "balance_loss_mlp": 1.16680181, "epoch": 0.005230722982113333, "flos": 23256937411200.0, "grad_norm": 2.3874485555726417, "language_loss": 0.80222178, "learning_rate": 3.999742163874298e-06, "loss": 0.82830989, "num_input_tokens_seen": 1784980, "router_z_loss_clip": 0.7421875, "router_z_loss_mlp": 3.0625, "step": 87, "time_per_iteration": 2.568986177444458 }, { "auxiliary_loss_clip": 0.01458642, "auxiliary_loss_mlp": 0.01126635, "balance_loss_clip": 1.0532496, "balance_loss_mlp": 1.16615224, "epoch": 0.005290846234781301, "flos": 16726957136640.0, "grad_norm": 1.8012755792208095, "language_loss": 0.94010115, "learning_rate": 3.999736061589492e-06, "loss": 0.96595389, "num_input_tokens_seen": 1803030, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 2.9375, "step": 88, "time_per_iteration": 2.5278608798980713 }, { "auxiliary_loss_clip": 0.01458957, "auxiliary_loss_mlp": 0.01112408, "balance_loss_clip": 1.04107285, "balance_loss_mlp": 1.15566182, "epoch": 0.005350969487449271, "flos": 20885043041280.0, "grad_norm": 2.0496604196188133, "language_loss": 0.84080839, "learning_rate": 3.999729887939251e-06, "loss": 0.86652201, "num_input_tokens_seen": 1822865, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 3.03125, "step": 89, "time_per_iteration": 2.563391923904419 }, { "auxiliary_loss_clip": 0.01449543, "auxiliary_loss_mlp": 0.01111738, "balance_loss_clip": 1.04006934, "balance_loss_mlp": 1.15215576, "epoch": 0.00541109274011724, "flos": 26208878976000.0, "grad_norm": 2.0047324414422962, "language_loss": 0.89549929, "learning_rate": 3.9997236429237945e-06, "loss": 0.92111206, "num_input_tokens_seen": 1842435, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 2.96875, "step": 90, "time_per_iteration": 2.5636699199676514 }, { "auxiliary_loss_clip": 0.01434598, "auxiliary_loss_mlp": 0.01115269, "balance_loss_clip": 1.04503119, "balance_loss_mlp": 1.14833903, "epoch": 0.00547121599278521, "flos": 21177510433920.0, "grad_norm": 3.218059252263368, "language_loss": 0.84463358, "learning_rate": 3.999717326543345e-06, "loss": 0.87013233, "num_input_tokens_seen": 1860065, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 2.859375, "step": 91, "time_per_iteration": 2.5507023334503174 }, { "auxiliary_loss_clip": 0.01435673, "auxiliary_loss_mlp": 0.01108879, "balance_loss_clip": 1.03992856, "balance_loss_mlp": 1.14235711, "epoch": 0.005531339245453179, "flos": 19717791822720.0, "grad_norm": 2.168802380764744, "language_loss": 0.85291636, "learning_rate": 3.9997109387981275e-06, "loss": 0.87836194, "num_input_tokens_seen": 1878135, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 2.9375, "step": 92, "time_per_iteration": 2.539388418197632 }, { "auxiliary_loss_clip": 0.01438361, "auxiliary_loss_mlp": 0.01139437, "balance_loss_clip": 1.06667149, "balance_loss_mlp": 1.14034557, "epoch": 0.005591462498121149, "flos": 17635222823040.0, "grad_norm": 2.7323537714010415, "language_loss": 0.89827538, "learning_rate": 3.99970447968837e-06, "loss": 0.92405343, "num_input_tokens_seen": 1894895, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 2.984375, "step": 93, "time_per_iteration": 2.520329475402832 }, { "auxiliary_loss_clip": 0.01444438, "auxiliary_loss_mlp": 0.01130383, "balance_loss_clip": 1.05375481, "balance_loss_mlp": 1.14248276, "epoch": 0.005651585750789117, "flos": 20010189392640.0, "grad_norm": 3.029902315947606, "language_loss": 0.85671586, "learning_rate": 3.9996979492143045e-06, "loss": 0.88246405, "num_input_tokens_seen": 1913220, "router_z_loss_clip": 0.765625, "router_z_loss_mlp": 3.015625, "step": 94, "time_per_iteration": 2.518771171569824 }, { "auxiliary_loss_clip": 0.01454722, "auxiliary_loss_mlp": 0.01115659, "balance_loss_clip": 1.07884717, "balance_loss_mlp": 1.27629483, "epoch": 0.005711709003457087, "flos": 59809917185280.0, "grad_norm": 1.1802114848498737, "language_loss": 0.6768719, "learning_rate": 3.999691347376162e-06, "loss": 0.7025758, "num_input_tokens_seen": 1970970, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.78125, "step": 95, "time_per_iteration": 4.534137964248657 }, { "auxiliary_loss_clip": 0.01433724, "auxiliary_loss_mlp": 0.01147286, "balance_loss_clip": 1.07719111, "balance_loss_mlp": 1.137779, "epoch": 0.005771832256125056, "flos": 15558693488640.0, "grad_norm": 3.5253256287489165, "language_loss": 0.88525021, "learning_rate": 3.99968467417418e-06, "loss": 0.91106033, "num_input_tokens_seen": 1988930, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 2.953125, "step": 96, "time_per_iteration": 4.071526050567627 }, { "auxiliary_loss_clip": 0.01420394, "auxiliary_loss_mlp": 0.01132117, "balance_loss_clip": 1.06559837, "balance_loss_mlp": 1.1362958, "epoch": 0.005831955508793026, "flos": 22527287573760.0, "grad_norm": 2.891057314816126, "language_loss": 0.88203895, "learning_rate": 3.999677929608596e-06, "loss": 0.90756404, "num_input_tokens_seen": 2006285, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 2.84375, "step": 97, "time_per_iteration": 4.145650863647461 }, { "auxiliary_loss_clip": 0.01413049, "auxiliary_loss_mlp": 0.01130992, "balance_loss_clip": 1.0612781, "balance_loss_mlp": 1.13330138, "epoch": 0.005892078761460995, "flos": 22048872946560.0, "grad_norm": 2.0178304849889797, "language_loss": 0.75365317, "learning_rate": 3.99967111367965e-06, "loss": 0.77909356, "num_input_tokens_seen": 2024905, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 2.796875, "step": 98, "time_per_iteration": 2.5823192596435547 }, { "auxiliary_loss_clip": 0.01407603, "auxiliary_loss_mlp": 0.0104898, "balance_loss_clip": 1.01293111, "balance_loss_mlp": 1.23561692, "epoch": 0.005952202014128965, "flos": 68535689598720.0, "grad_norm": 0.963339581518281, "language_loss": 0.65151054, "learning_rate": 3.999664226387586e-06, "loss": 0.67607635, "num_input_tokens_seen": 2086220, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.71875, "step": 99, "time_per_iteration": 3.2279751300811768 }, { "auxiliary_loss_clip": 0.01421633, "auxiliary_loss_mlp": 0.01152154, "balance_loss_clip": 1.07628894, "balance_loss_mlp": 1.13284802, "epoch": 0.006012325266796933, "flos": 22959931541760.0, "grad_norm": 2.089015714107569, "language_loss": 0.8961187, "learning_rate": 3.999657267732648e-06, "loss": 0.92185652, "num_input_tokens_seen": 2103365, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 2.890625, "step": 100, "time_per_iteration": 2.543591022491455 }, { "auxiliary_loss_clip": 0.01423763, "auxiliary_loss_mlp": 0.01135114, "balance_loss_clip": 1.06478059, "balance_loss_mlp": 1.13237977, "epoch": 0.006072448519464903, "flos": 17346979704960.0, "grad_norm": 2.0200724237283785, "language_loss": 0.89709979, "learning_rate": 3.999650237715088e-06, "loss": 0.92268854, "num_input_tokens_seen": 2121995, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 2.90625, "step": 101, "time_per_iteration": 2.6019370555877686 }, { "auxiliary_loss_clip": 0.01417283, "auxiliary_loss_mlp": 0.01138533, "balance_loss_clip": 1.06633949, "balance_loss_mlp": 1.13459754, "epoch": 0.006132571772132872, "flos": 24679962316800.0, "grad_norm": 2.382432542633007, "language_loss": 0.89427447, "learning_rate": 3.9996431363351536e-06, "loss": 0.91983271, "num_input_tokens_seen": 2141815, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 2.828125, "step": 102, "time_per_iteration": 2.55599045753479 }, { "auxiliary_loss_clip": 0.01412183, "auxiliary_loss_mlp": 0.01121469, "balance_loss_clip": 1.05390084, "balance_loss_mlp": 1.13109529, "epoch": 0.006192695024800842, "flos": 21464741122560.0, "grad_norm": 2.1856308121545287, "language_loss": 0.86522692, "learning_rate": 3.9996359635931e-06, "loss": 0.89056349, "num_input_tokens_seen": 2161125, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 2.8125, "step": 103, "time_per_iteration": 2.555086374282837 }, { "auxiliary_loss_clip": 0.01414161, "auxiliary_loss_mlp": 0.01133673, "balance_loss_clip": 1.06314886, "balance_loss_mlp": 1.13492, "epoch": 0.006252818277468811, "flos": 17419459420800.0, "grad_norm": 2.0955448234560823, "language_loss": 0.92823404, "learning_rate": 3.999628719489181e-06, "loss": 0.9537124, "num_input_tokens_seen": 2179510, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 2.78125, "step": 104, "time_per_iteration": 2.5785515308380127 }, { "auxiliary_loss_clip": 0.01407541, "auxiliary_loss_mlp": 0.01126941, "balance_loss_clip": 1.05708396, "balance_loss_mlp": 1.12816191, "epoch": 0.006312941530136781, "flos": 19098537304320.0, "grad_norm": 14.34565848265555, "language_loss": 0.94836128, "learning_rate": 3.999621404023658e-06, "loss": 0.97370607, "num_input_tokens_seen": 2197870, "router_z_loss_clip": 0.69921875, "router_z_loss_mlp": 2.796875, "step": 105, "time_per_iteration": 2.543741226196289 }, { "auxiliary_loss_clip": 0.01410998, "auxiliary_loss_mlp": 0.01137532, "balance_loss_clip": 1.06157172, "balance_loss_mlp": 1.12886274, "epoch": 0.006373064782804749, "flos": 24059695368960.0, "grad_norm": 2.226532575855215, "language_loss": 0.85008109, "learning_rate": 3.9996140171967904e-06, "loss": 0.87556636, "num_input_tokens_seen": 2217495, "router_z_loss_clip": 0.7578125, "router_z_loss_mlp": 2.8125, "step": 106, "time_per_iteration": 2.576078414916992 }, { "auxiliary_loss_clip": 0.01402094, "auxiliary_loss_mlp": 0.01119371, "balance_loss_clip": 1.04894197, "balance_loss_mlp": 1.12047982, "epoch": 0.006433188035472719, "flos": 18331460622720.0, "grad_norm": 2.3678014845405873, "language_loss": 0.81457663, "learning_rate": 3.9996065590088426e-06, "loss": 0.8397913, "num_input_tokens_seen": 2236520, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 2.8125, "step": 107, "time_per_iteration": 2.497641086578369 }, { "auxiliary_loss_clip": 0.01341496, "auxiliary_loss_mlp": 0.0110217, "balance_loss_clip": 1.07508552, "balance_loss_mlp": 1.18681765, "epoch": 0.006493311288140688, "flos": 62558907816960.0, "grad_norm": 0.9461185842272075, "language_loss": 0.64579511, "learning_rate": 3.999599029460081e-06, "loss": 0.67023176, "num_input_tokens_seen": 2300140, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.546875, "step": 108, "time_per_iteration": 3.157834053039551 }, { "auxiliary_loss_clip": 0.01398715, "auxiliary_loss_mlp": 0.01113337, "balance_loss_clip": 1.04224062, "balance_loss_mlp": 1.12512159, "epoch": 0.006553434540808658, "flos": 19499130777600.0, "grad_norm": 2.0634755752913914, "language_loss": 0.96250588, "learning_rate": 3.999591428550772e-06, "loss": 0.98762637, "num_input_tokens_seen": 2317320, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 2.734375, "step": 109, "time_per_iteration": 2.527080535888672 }, { "auxiliary_loss_clip": 0.01397787, "auxiliary_loss_mlp": 0.01115048, "balance_loss_clip": 1.04652655, "balance_loss_mlp": 1.12549496, "epoch": 0.006613557793476627, "flos": 21104088111360.0, "grad_norm": 1.7137254663805555, "language_loss": 0.83182019, "learning_rate": 3.999583756281189e-06, "loss": 0.85694849, "num_input_tokens_seen": 2337820, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 2.71875, "step": 110, "time_per_iteration": 2.596374273300171 }, { "auxiliary_loss_clip": 0.01399336, "auxiliary_loss_mlp": 0.01111445, "balance_loss_clip": 1.04158795, "balance_loss_mlp": 1.12313271, "epoch": 0.006673681046144597, "flos": 26029564899840.0, "grad_norm": 2.0155584080496984, "language_loss": 0.81827509, "learning_rate": 3.999576012651605e-06, "loss": 0.84338289, "num_input_tokens_seen": 2358560, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 2.75, "step": 111, "time_per_iteration": 2.590143918991089 }, { "auxiliary_loss_clip": 0.01389427, "auxiliary_loss_mlp": 0.0111467, "balance_loss_clip": 1.04719806, "balance_loss_mlp": 1.11860859, "epoch": 0.006733804298812566, "flos": 23146681737600.0, "grad_norm": 2.370401313774829, "language_loss": 0.92597079, "learning_rate": 3.999568197662297e-06, "loss": 0.95101178, "num_input_tokens_seen": 2379005, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 2.703125, "step": 112, "time_per_iteration": 2.553640127182007 }, { "auxiliary_loss_clip": 0.01399439, "auxiliary_loss_mlp": 0.01106971, "balance_loss_clip": 1.04140568, "balance_loss_mlp": 1.12583899, "epoch": 0.006793927551480535, "flos": 11763669479040.0, "grad_norm": 2.2048966935764076, "language_loss": 0.77447867, "learning_rate": 3.999560311313543e-06, "loss": 0.79954273, "num_input_tokens_seen": 2395610, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 2.734375, "step": 113, "time_per_iteration": 2.5330731868743896 }, { "auxiliary_loss_clip": 0.01391044, "auxiliary_loss_mlp": 0.01108191, "balance_loss_clip": 1.03609276, "balance_loss_mlp": 1.12234282, "epoch": 0.006854050804148504, "flos": 19170947197440.0, "grad_norm": 1.788522558774708, "language_loss": 0.91970974, "learning_rate": 3.999552353605626e-06, "loss": 0.94470197, "num_input_tokens_seen": 2415005, "router_z_loss_clip": 0.72265625, "router_z_loss_mlp": 2.6875, "step": 114, "time_per_iteration": 2.5473427772521973 }, { "auxiliary_loss_clip": 0.0139664, "auxiliary_loss_mlp": 0.01111945, "balance_loss_clip": 1.04928839, "balance_loss_mlp": 1.11947298, "epoch": 0.006914174056816474, "flos": 21980792062080.0, "grad_norm": 2.5305389445574975, "language_loss": 0.93227071, "learning_rate": 3.999544324538829e-06, "loss": 0.95735657, "num_input_tokens_seen": 2433965, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 2.765625, "step": 115, "time_per_iteration": 2.5604898929595947 }, { "auxiliary_loss_clip": 0.01389107, "auxiliary_loss_mlp": 0.01104315, "balance_loss_clip": 1.03507793, "balance_loss_mlp": 1.11516285, "epoch": 0.006974297309484443, "flos": 16288238592000.0, "grad_norm": 2.5654792929808563, "language_loss": 0.80363703, "learning_rate": 3.999536224113438e-06, "loss": 0.82857126, "num_input_tokens_seen": 2451605, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 2.734375, "step": 116, "time_per_iteration": 2.5479090213775635 }, { "auxiliary_loss_clip": 0.01381443, "auxiliary_loss_mlp": 0.01104956, "balance_loss_clip": 1.0391047, "balance_loss_mlp": 1.11800122, "epoch": 0.007034420562152413, "flos": 26102812665600.0, "grad_norm": 3.7731086756359464, "language_loss": 0.86852342, "learning_rate": 3.9995280523297416e-06, "loss": 0.89338744, "num_input_tokens_seen": 2472035, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 2.625, "step": 117, "time_per_iteration": 2.556678056716919 }, { "auxiliary_loss_clip": 0.01386747, "auxiliary_loss_mlp": 0.0111943, "balance_loss_clip": 1.04842925, "balance_loss_mlp": 1.12321186, "epoch": 0.007094543814820382, "flos": 14203889112960.0, "grad_norm": 2.1802516405551553, "language_loss": 0.82808697, "learning_rate": 3.9995198091880334e-06, "loss": 0.8531487, "num_input_tokens_seen": 2489285, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 2.640625, "step": 118, "time_per_iteration": 2.588897705078125 }, { "auxiliary_loss_clip": 0.01394154, "auxiliary_loss_mlp": 0.01123586, "balance_loss_clip": 1.05520773, "balance_loss_mlp": 1.11883044, "epoch": 0.007154667067488351, "flos": 14975120246400.0, "grad_norm": 2.874811437563815, "language_loss": 0.97947919, "learning_rate": 3.999511494688606e-06, "loss": 1.00465655, "num_input_tokens_seen": 2506460, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 2.75, "step": 119, "time_per_iteration": 2.5415806770324707 }, { "auxiliary_loss_clip": 0.01395516, "auxiliary_loss_mlp": 0.01106237, "balance_loss_clip": 1.03919351, "balance_loss_mlp": 1.1161859, "epoch": 0.00721479032015632, "flos": 20192261466240.0, "grad_norm": 2.3987781398524306, "language_loss": 0.87784606, "learning_rate": 3.999503108831758e-06, "loss": 0.90286356, "num_input_tokens_seen": 2525565, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 2.796875, "step": 120, "time_per_iteration": 2.533524513244629 }, { "auxiliary_loss_clip": 0.01384108, "auxiliary_loss_mlp": 0.01112804, "balance_loss_clip": 1.04556966, "balance_loss_mlp": 1.11985481, "epoch": 0.00727491357282429, "flos": 23146158067200.0, "grad_norm": 1.8486992631891273, "language_loss": 0.92068368, "learning_rate": 3.999494651617787e-06, "loss": 0.94565284, "num_input_tokens_seen": 2546605, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 2.640625, "step": 121, "time_per_iteration": 2.5171191692352295 }, { "auxiliary_loss_clip": 0.01389371, "auxiliary_loss_mlp": 0.01132887, "balance_loss_clip": 1.06698799, "balance_loss_mlp": 1.12244916, "epoch": 0.007335036825492259, "flos": 15520812796800.0, "grad_norm": 2.3026404931976407, "language_loss": 0.88909745, "learning_rate": 3.999486123046994e-06, "loss": 0.91432005, "num_input_tokens_seen": 2560730, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 2.671875, "step": 122, "time_per_iteration": 2.5436108112335205 }, { "auxiliary_loss_clip": 0.01383648, "auxiliary_loss_mlp": 0.01112574, "balance_loss_clip": 1.04171658, "balance_loss_mlp": 1.11539435, "epoch": 0.007395160078160229, "flos": 24242221290240.0, "grad_norm": 3.7241053052078352, "language_loss": 0.91549945, "learning_rate": 3.999477523119686e-06, "loss": 0.94046164, "num_input_tokens_seen": 2579550, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 2.6875, "step": 123, "time_per_iteration": 2.5250282287597656 }, { "auxiliary_loss_clip": 0.01382203, "auxiliary_loss_mlp": 0.01103122, "balance_loss_clip": 1.03736663, "balance_loss_mlp": 1.11126471, "epoch": 0.007455283330828198, "flos": 31758428050560.0, "grad_norm": 5.948156608363036, "language_loss": 0.69979113, "learning_rate": 3.999468851836168e-06, "loss": 0.72464442, "num_input_tokens_seen": 2600390, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 2.71875, "step": 124, "time_per_iteration": 2.675476551055908 }, { "auxiliary_loss_clip": 0.01374967, "auxiliary_loss_mlp": 0.01109007, "balance_loss_clip": 1.04382336, "balance_loss_mlp": 1.11420441, "epoch": 0.007515406583496167, "flos": 26613941103360.0, "grad_norm": 2.139508831646684, "language_loss": 0.87107795, "learning_rate": 3.999460109196749e-06, "loss": 0.89591765, "num_input_tokens_seen": 2620770, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 2.609375, "step": 125, "time_per_iteration": 2.5488245487213135 }, { "auxiliary_loss_clip": 0.01384091, "auxiliary_loss_mlp": 0.01117652, "balance_loss_clip": 1.04979801, "balance_loss_mlp": 1.1116432, "epoch": 0.007575529836164136, "flos": 18222706137600.0, "grad_norm": 2.6663440653428, "language_loss": 0.80915189, "learning_rate": 3.999451295201743e-06, "loss": 0.83416933, "num_input_tokens_seen": 2639900, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 2.71875, "step": 126, "time_per_iteration": 2.552436590194702 }, { "auxiliary_loss_clip": 0.01383761, "auxiliary_loss_mlp": 0.01104154, "balance_loss_clip": 1.03906584, "balance_loss_mlp": 1.11447549, "epoch": 0.007635653088832106, "flos": 21579325804800.0, "grad_norm": 2.829396146717874, "language_loss": 0.66536784, "learning_rate": 3.999442409851463e-06, "loss": 0.690247, "num_input_tokens_seen": 2657450, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 2.6875, "step": 127, "time_per_iteration": 2.5131595134735107 }, { "auxiliary_loss_clip": 0.01375587, "auxiliary_loss_mlp": 0.0110898, "balance_loss_clip": 1.04360604, "balance_loss_mlp": 1.1157546, "epoch": 0.007695776341500075, "flos": 25373861055360.0, "grad_norm": 3.1321009293481024, "language_loss": 0.86887217, "learning_rate": 3.999433453146227e-06, "loss": 0.89371789, "num_input_tokens_seen": 2678150, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 2.59375, "step": 128, "time_per_iteration": 2.658592700958252 }, { "auxiliary_loss_clip": 0.01373952, "auxiliary_loss_mlp": 0.01117951, "balance_loss_clip": 1.04957247, "balance_loss_mlp": 1.10953844, "epoch": 0.007755899594168045, "flos": 22342876439040.0, "grad_norm": 1.8980833921998537, "language_loss": 0.83853519, "learning_rate": 3.9994244250863535e-06, "loss": 0.86345422, "num_input_tokens_seen": 2698290, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 2.640625, "step": 129, "time_per_iteration": 2.5144636631011963 }, { "auxiliary_loss_clip": 0.01367964, "auxiliary_loss_mlp": 0.01104102, "balance_loss_clip": 1.03801274, "balance_loss_mlp": 1.10930061, "epoch": 0.007816022846836013, "flos": 22637124311040.0, "grad_norm": 2.1901717057431904, "language_loss": 0.96096313, "learning_rate": 3.999415325672166e-06, "loss": 0.98568374, "num_input_tokens_seen": 2717630, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 2.578125, "step": 130, "time_per_iteration": 2.513084888458252 }, { "auxiliary_loss_clip": 0.01366017, "auxiliary_loss_mlp": 0.0110484, "balance_loss_clip": 1.04018116, "balance_loss_mlp": 1.11058915, "epoch": 0.007876146099503984, "flos": 18182032536960.0, "grad_norm": 2.2948658674142837, "language_loss": 0.80744946, "learning_rate": 3.9994061549039886e-06, "loss": 0.83215797, "num_input_tokens_seen": 2735835, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 2.5625, "step": 131, "time_per_iteration": 2.493565797805786 }, { "auxiliary_loss_clip": 0.01374832, "auxiliary_loss_mlp": 0.0110615, "balance_loss_clip": 1.04330242, "balance_loss_mlp": 1.11081851, "epoch": 0.007936269352171952, "flos": 27118436382720.0, "grad_norm": 2.7280402886740327, "language_loss": 0.82224703, "learning_rate": 3.9993969127821485e-06, "loss": 0.84705681, "num_input_tokens_seen": 2756335, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 2.640625, "step": 132, "time_per_iteration": 2.5553696155548096 }, { "auxiliary_loss_clip": 0.01363389, "auxiliary_loss_mlp": 0.01108456, "balance_loss_clip": 1.04355907, "balance_loss_mlp": 1.10452795, "epoch": 0.007996392604839923, "flos": 19025324449920.0, "grad_norm": 1.8894809484000292, "language_loss": 0.94121611, "learning_rate": 3.9993875993069755e-06, "loss": 0.96593451, "num_input_tokens_seen": 2775090, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 2.59375, "step": 133, "time_per_iteration": 2.4987971782684326 }, { "auxiliary_loss_clip": 0.01359644, "auxiliary_loss_mlp": 0.01107564, "balance_loss_clip": 1.04538476, "balance_loss_mlp": 1.11143434, "epoch": 0.008056515857507891, "flos": 25482964654080.0, "grad_norm": 1.743450272302483, "language_loss": 0.72378039, "learning_rate": 3.9993782144788025e-06, "loss": 0.74845243, "num_input_tokens_seen": 2795320, "router_z_loss_clip": 0.62109375, "router_z_loss_mlp": 2.484375, "step": 134, "time_per_iteration": 3.978874683380127 }, { "auxiliary_loss_clip": 0.0136099, "auxiliary_loss_mlp": 0.01107337, "balance_loss_clip": 1.04010284, "balance_loss_mlp": 1.10283303, "epoch": 0.00811663911017586, "flos": 20556545258880.0, "grad_norm": 4.696246405269054, "language_loss": 0.87248123, "learning_rate": 3.999368758297964e-06, "loss": 0.89716446, "num_input_tokens_seen": 2812815, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 2.578125, "step": 135, "time_per_iteration": 5.2392988204956055 }, { "auxiliary_loss_clip": 0.01362226, "auxiliary_loss_mlp": 0.01106394, "balance_loss_clip": 1.03935051, "balance_loss_mlp": 1.10586715, "epoch": 0.00817676236284383, "flos": 18798947994240.0, "grad_norm": 1.8825613314785814, "language_loss": 0.87810934, "learning_rate": 3.999359230764798e-06, "loss": 0.90279549, "num_input_tokens_seen": 2830445, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 2.5625, "step": 136, "time_per_iteration": 3.84043025970459 }, { "auxiliary_loss_clip": 0.01359523, "auxiliary_loss_mlp": 0.01091745, "balance_loss_clip": 1.0304718, "balance_loss_mlp": 1.10392106, "epoch": 0.008236885615511799, "flos": 23872596059520.0, "grad_norm": 2.1865065382853994, "language_loss": 0.82736731, "learning_rate": 3.999349631879643e-06, "loss": 0.85187995, "num_input_tokens_seen": 2846965, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 2.5625, "step": 137, "time_per_iteration": 2.4926602840423584 }, { "auxiliary_loss_clip": 0.01355187, "auxiliary_loss_mlp": 0.01092266, "balance_loss_clip": 1.03099298, "balance_loss_mlp": 1.10456729, "epoch": 0.00829700886817977, "flos": 24642500561280.0, "grad_norm": 1.8798885548199515, "language_loss": 0.8933351, "learning_rate": 3.9993399616428425e-06, "loss": 0.91780961, "num_input_tokens_seen": 2867520, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 2.5, "step": 138, "time_per_iteration": 2.527296543121338 }, { "auxiliary_loss_clip": 0.01346701, "auxiliary_loss_mlp": 0.01092938, "balance_loss_clip": 1.02894676, "balance_loss_mlp": 1.1021167, "epoch": 0.008357132120847738, "flos": 25260917207040.0, "grad_norm": 2.235375654561318, "language_loss": 0.9069339, "learning_rate": 3.999330220054742e-06, "loss": 0.93133026, "num_input_tokens_seen": 2885675, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 2.4375, "step": 139, "time_per_iteration": 2.5068202018737793 }, { "auxiliary_loss_clip": 0.01364767, "auxiliary_loss_mlp": 0.01099099, "balance_loss_clip": 1.03525043, "balance_loss_mlp": 1.10740197, "epoch": 0.008417255373515706, "flos": 27343660763520.0, "grad_norm": 2.359048254552134, "language_loss": 0.84722054, "learning_rate": 3.9993204071156894e-06, "loss": 0.87185919, "num_input_tokens_seen": 2905960, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 2.5625, "step": 140, "time_per_iteration": 2.596464157104492 }, { "auxiliary_loss_clip": 0.01350993, "auxiliary_loss_mlp": 0.01098609, "balance_loss_clip": 1.03685844, "balance_loss_mlp": 1.10390186, "epoch": 0.008477378626183677, "flos": 17638120465920.0, "grad_norm": 7.5568190931205255, "language_loss": 0.8278836, "learning_rate": 3.999310522826034e-06, "loss": 0.85237962, "num_input_tokens_seen": 2922780, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 2.46875, "step": 141, "time_per_iteration": 2.5674993991851807 }, { "auxiliary_loss_clip": 0.01353554, "auxiliary_loss_mlp": 0.01096234, "balance_loss_clip": 1.0332911, "balance_loss_mlp": 1.10089064, "epoch": 0.008537501878851645, "flos": 13880488389120.0, "grad_norm": 2.45829146302685, "language_loss": 0.79976183, "learning_rate": 3.999300567186129e-06, "loss": 0.8242597, "num_input_tokens_seen": 2938765, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 2.53125, "step": 142, "time_per_iteration": 2.4487762451171875 }, { "auxiliary_loss_clip": 0.01293382, "auxiliary_loss_mlp": 0.01083486, "balance_loss_clip": 1.05849934, "balance_loss_mlp": 1.14034057, "epoch": 0.008597625131519616, "flos": 71244320832000.0, "grad_norm": 1.0288296118365807, "language_loss": 0.66773266, "learning_rate": 3.999290540196329e-06, "loss": 0.69150138, "num_input_tokens_seen": 3006665, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.53125, "step": 143, "time_per_iteration": 3.269559144973755 }, { "auxiliary_loss_clip": 0.0135308, "auxiliary_loss_mlp": 0.01103362, "balance_loss_clip": 1.04156375, "balance_loss_mlp": 1.10496008, "epoch": 0.008657748384187584, "flos": 17601880608000.0, "grad_norm": 2.110396981770736, "language_loss": 0.83264089, "learning_rate": 3.999280441856992e-06, "loss": 0.85720533, "num_input_tokens_seen": 3024335, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 2.46875, "step": 144, "time_per_iteration": 2.4857771396636963 }, { "auxiliary_loss_clip": 0.01341317, "auxiliary_loss_mlp": 0.01097107, "balance_loss_clip": 1.03240073, "balance_loss_mlp": 1.09779024, "epoch": 0.008717871636855555, "flos": 19714405420800.0, "grad_norm": 2.0613967466713885, "language_loss": 0.87342119, "learning_rate": 3.9992702721684805e-06, "loss": 0.89780545, "num_input_tokens_seen": 3043300, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 2.4375, "step": 145, "time_per_iteration": 2.5223348140716553 }, { "auxiliary_loss_clip": 0.01353492, "auxiliary_loss_mlp": 0.01098152, "balance_loss_clip": 1.03301656, "balance_loss_mlp": 1.0989728, "epoch": 0.008777994889523523, "flos": 24716271997440.0, "grad_norm": 1.8762206084414534, "language_loss": 0.85667378, "learning_rate": 3.999260031131154e-06, "loss": 0.88119018, "num_input_tokens_seen": 3064610, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 2.546875, "step": 146, "time_per_iteration": 2.530491828918457 }, { "auxiliary_loss_clip": 0.0127963, "auxiliary_loss_mlp": 0.01026299, "balance_loss_clip": 1.00312459, "balance_loss_mlp": 1.14265537, "epoch": 0.008838118142191492, "flos": 70128916715520.0, "grad_norm": 0.8141797365584135, "language_loss": 0.59914416, "learning_rate": 3.999249718745381e-06, "loss": 0.62220341, "num_input_tokens_seen": 3130385, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.375, "step": 147, "time_per_iteration": 3.2129898071289062 }, { "auxiliary_loss_clip": 0.01345115, "auxiliary_loss_mlp": 0.01096029, "balance_loss_clip": 1.03628087, "balance_loss_mlp": 1.10191417, "epoch": 0.008898241394859462, "flos": 20043845809920.0, "grad_norm": 1.8636165565969143, "language_loss": 0.83679867, "learning_rate": 3.999239335011527e-06, "loss": 0.86121005, "num_input_tokens_seen": 3149760, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 2.4375, "step": 148, "time_per_iteration": 2.566246747970581 }, { "auxiliary_loss_clip": 0.01350047, "auxiliary_loss_mlp": 0.01117175, "balance_loss_clip": 1.05318344, "balance_loss_mlp": 1.09972465, "epoch": 0.008958364647527431, "flos": 10742843969280.0, "grad_norm": 2.214139356241396, "language_loss": 0.87434661, "learning_rate": 3.999228879929965e-06, "loss": 0.89901882, "num_input_tokens_seen": 3164500, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 2.5, "step": 149, "time_per_iteration": 2.482739210128784 }, { "auxiliary_loss_clip": 0.01352019, "auxiliary_loss_mlp": 0.01106148, "balance_loss_clip": 1.04434991, "balance_loss_mlp": 1.10022926, "epoch": 0.009018487900195401, "flos": 29126326250880.0, "grad_norm": 2.052042896827704, "language_loss": 0.92434806, "learning_rate": 3.999218353501066e-06, "loss": 0.94892967, "num_input_tokens_seen": 3182455, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 2.515625, "step": 150, "time_per_iteration": 2.5825510025024414 }, { "auxiliary_loss_clip": 0.01343467, "auxiliary_loss_mlp": 0.01100108, "balance_loss_clip": 1.03931189, "balance_loss_mlp": 1.09753084, "epoch": 0.00907861115286337, "flos": 32962268240640.0, "grad_norm": 2.0323430661128237, "language_loss": 0.73467743, "learning_rate": 3.999207755725208e-06, "loss": 0.75911319, "num_input_tokens_seen": 3203995, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 2.453125, "step": 151, "time_per_iteration": 2.621089458465576 }, { "auxiliary_loss_clip": 0.01348765, "auxiliary_loss_mlp": 0.01104451, "balance_loss_clip": 1.0417949, "balance_loss_mlp": 1.09938061, "epoch": 0.009138734405531338, "flos": 21761362967040.0, "grad_norm": 2.3198228318377594, "language_loss": 0.87541401, "learning_rate": 3.999197086602766e-06, "loss": 0.89994621, "num_input_tokens_seen": 3222575, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 2.484375, "step": 152, "time_per_iteration": 2.5434858798980713 }, { "auxiliary_loss_clip": 0.01342767, "auxiliary_loss_mlp": 0.01094005, "balance_loss_clip": 1.03306484, "balance_loss_mlp": 1.10060859, "epoch": 0.009198857658199309, "flos": 20841681265920.0, "grad_norm": 3.727614359588284, "language_loss": 0.8170656, "learning_rate": 3.9991863461341234e-06, "loss": 0.84143329, "num_input_tokens_seen": 3240180, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 2.421875, "step": 153, "time_per_iteration": 2.5529229640960693 }, { "auxiliary_loss_clip": 0.01340483, "auxiliary_loss_mlp": 0.01101178, "balance_loss_clip": 1.04081035, "balance_loss_mlp": 1.09398246, "epoch": 0.009258980910867277, "flos": 24826213468800.0, "grad_norm": 2.0004419209962077, "language_loss": 0.88920546, "learning_rate": 3.999175534319662e-06, "loss": 0.91362202, "num_input_tokens_seen": 3259800, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 2.46875, "step": 154, "time_per_iteration": 2.498523712158203 }, { "auxiliary_loss_clip": 0.0134595, "auxiliary_loss_mlp": 0.01134348, "balance_loss_clip": 1.07116711, "balance_loss_mlp": 1.09875727, "epoch": 0.009319104163535248, "flos": 28766511112320.0, "grad_norm": 2.1190994788647393, "language_loss": 0.88754398, "learning_rate": 3.999164651159769e-06, "loss": 0.91234696, "num_input_tokens_seen": 3280400, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 2.46875, "step": 155, "time_per_iteration": 2.5487866401672363 }, { "auxiliary_loss_clip": 0.01347773, "auxiliary_loss_mlp": 0.01106749, "balance_loss_clip": 1.04685855, "balance_loss_mlp": 1.09622169, "epoch": 0.009379227416203216, "flos": 16581055098240.0, "grad_norm": 2.63691628375773, "language_loss": 0.85156655, "learning_rate": 3.999153696654832e-06, "loss": 0.87611175, "num_input_tokens_seen": 3297600, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 2.515625, "step": 156, "time_per_iteration": 2.453019380569458 }, { "auxiliary_loss_clip": 0.01344155, "auxiliary_loss_mlp": 0.01096222, "balance_loss_clip": 1.03523493, "balance_loss_mlp": 1.0993228, "epoch": 0.009439350668871187, "flos": 18329016827520.0, "grad_norm": 2.1940432790252355, "language_loss": 0.98876888, "learning_rate": 3.9991426708052416e-06, "loss": 1.01317263, "num_input_tokens_seen": 3313635, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 2.4375, "step": 157, "time_per_iteration": 2.4774229526519775 }, { "auxiliary_loss_clip": 0.01336484, "auxiliary_loss_mlp": 0.01122648, "balance_loss_clip": 1.06270957, "balance_loss_mlp": 1.09593248, "epoch": 0.009499473921539155, "flos": 24348846182400.0, "grad_norm": 4.935499386420861, "language_loss": 0.87422907, "learning_rate": 3.999131573611392e-06, "loss": 0.8988204, "num_input_tokens_seen": 3333735, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 2.40625, "step": 158, "time_per_iteration": 2.5238518714904785 }, { "auxiliary_loss_clip": 0.01340118, "auxiliary_loss_mlp": 0.01117905, "balance_loss_clip": 1.05481958, "balance_loss_mlp": 1.09583664, "epoch": 0.009559597174207124, "flos": 16398389531520.0, "grad_norm": 2.8192939651199223, "language_loss": 0.85176593, "learning_rate": 3.999120405073679e-06, "loss": 0.87634623, "num_input_tokens_seen": 3348800, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 2.4375, "step": 159, "time_per_iteration": 2.479688882827759 }, { "auxiliary_loss_clip": 0.0133384, "auxiliary_loss_mlp": 0.01106715, "balance_loss_clip": 1.04439223, "balance_loss_mlp": 1.09363759, "epoch": 0.009619720426875094, "flos": 22855785356160.0, "grad_norm": 1.9515699711388057, "language_loss": 0.85391754, "learning_rate": 3.9991091651925014e-06, "loss": 0.87832308, "num_input_tokens_seen": 3368595, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 2.40625, "step": 160, "time_per_iteration": 2.4945571422576904 }, { "auxiliary_loss_clip": 0.01338913, "auxiliary_loss_mlp": 0.01099574, "balance_loss_clip": 1.04101801, "balance_loss_mlp": 1.09515488, "epoch": 0.009679843679543063, "flos": 19134009112320.0, "grad_norm": 2.5362498189638, "language_loss": 0.90829933, "learning_rate": 3.999097853968259e-06, "loss": 0.93268418, "num_input_tokens_seen": 3384975, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.4375, "step": 161, "time_per_iteration": 2.4981765747070312 }, { "auxiliary_loss_clip": 0.01340961, "auxiliary_loss_mlp": 0.01090779, "balance_loss_clip": 1.03322458, "balance_loss_mlp": 1.09601188, "epoch": 0.009739966932211033, "flos": 20301958558080.0, "grad_norm": 2.448191667557215, "language_loss": 0.90737391, "learning_rate": 3.999086471401357e-06, "loss": 0.93169141, "num_input_tokens_seen": 3404755, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 2.453125, "step": 162, "time_per_iteration": 2.5141706466674805 }, { "auxiliary_loss_clip": 0.01249292, "auxiliary_loss_mlp": 0.01097223, "balance_loss_clip": 1.0758605, "balance_loss_mlp": 1.11713386, "epoch": 0.009800090184879002, "flos": 67031073112320.0, "grad_norm": 1.1788821400625664, "language_loss": 0.72151339, "learning_rate": 3.9990750174922005e-06, "loss": 0.74497843, "num_input_tokens_seen": 3467210, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.3203125, "step": 163, "time_per_iteration": 3.0873191356658936 }, { "auxiliary_loss_clip": 0.01329569, "auxiliary_loss_mlp": 0.01099409, "balance_loss_clip": 1.0405674, "balance_loss_mlp": 1.09513378, "epoch": 0.00986021343754697, "flos": 17163755556480.0, "grad_norm": 2.5235900048992903, "language_loss": 0.83601165, "learning_rate": 3.9990634922412e-06, "loss": 0.86030143, "num_input_tokens_seen": 3483220, "router_z_loss_clip": 0.58984375, "router_z_loss_mlp": 2.34375, "step": 164, "time_per_iteration": 2.525221824645996 }, { "auxiliary_loss_clip": 0.01322732, "auxiliary_loss_mlp": 0.01090181, "balance_loss_clip": 1.03172088, "balance_loss_mlp": 1.08707738, "epoch": 0.00992033669021494, "flos": 17748445962240.0, "grad_norm": 2.0227093827211897, "language_loss": 0.88425285, "learning_rate": 3.9990518956487655e-06, "loss": 0.908382, "num_input_tokens_seen": 3501465, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.34375, "step": 165, "time_per_iteration": 2.4771060943603516 }, { "auxiliary_loss_clip": 0.01331029, "auxiliary_loss_mlp": 0.01106093, "balance_loss_clip": 1.0435797, "balance_loss_mlp": 1.09030557, "epoch": 0.00998045994288291, "flos": 25296109724160.0, "grad_norm": 2.4914540557712663, "language_loss": 0.7945292, "learning_rate": 3.9990402277153105e-06, "loss": 0.81890035, "num_input_tokens_seen": 3520480, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 2.40625, "step": 166, "time_per_iteration": 2.5510506629943848 }, { "auxiliary_loss_clip": 0.01329746, "auxiliary_loss_mlp": 0.0110877, "balance_loss_clip": 1.04873562, "balance_loss_mlp": 1.08858144, "epoch": 0.01004058319555088, "flos": 32297801644800.0, "grad_norm": 2.232365353759961, "language_loss": 0.9095034, "learning_rate": 3.999028488441252e-06, "loss": 0.93388855, "num_input_tokens_seen": 3539570, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 2.40625, "step": 167, "time_per_iteration": 2.5629281997680664 }, { "auxiliary_loss_clip": 0.01327017, "auxiliary_loss_mlp": 0.01124412, "balance_loss_clip": 1.066715, "balance_loss_mlp": 1.09082603, "epoch": 0.010100706448218848, "flos": 11319365116800.0, "grad_norm": 5.029144748726258, "language_loss": 0.89412969, "learning_rate": 3.999016677827009e-06, "loss": 0.91864395, "num_input_tokens_seen": 3555465, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 2.359375, "step": 168, "time_per_iteration": 2.4647042751312256 }, { "auxiliary_loss_clip": 0.01318749, "auxiliary_loss_mlp": 0.01095637, "balance_loss_clip": 1.03674769, "balance_loss_mlp": 1.08649659, "epoch": 0.010160829700886819, "flos": 29718103662720.0, "grad_norm": 1.7973544688290481, "language_loss": 0.86207986, "learning_rate": 3.999004795873003e-06, "loss": 0.88622367, "num_input_tokens_seen": 3578970, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.3125, "step": 169, "time_per_iteration": 2.584573984146118 }, { "auxiliary_loss_clip": 0.01321291, "auxiliary_loss_mlp": 0.01098267, "balance_loss_clip": 1.03785205, "balance_loss_mlp": 1.0876385, "epoch": 0.010220952953554787, "flos": 20411306536320.0, "grad_norm": 1.9927921484930773, "language_loss": 0.83916354, "learning_rate": 3.998992842579657e-06, "loss": 0.86335915, "num_input_tokens_seen": 3597275, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 2.34375, "step": 170, "time_per_iteration": 2.49104642868042 }, { "auxiliary_loss_clip": 0.01333332, "auxiliary_loss_mlp": 0.01111543, "balance_loss_clip": 1.05127096, "balance_loss_mlp": 1.08931541, "epoch": 0.010281076206222756, "flos": 31283783850240.0, "grad_norm": 2.252100548603337, "language_loss": 0.89015782, "learning_rate": 3.9989808179474e-06, "loss": 0.91460657, "num_input_tokens_seen": 3618905, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 2.4375, "step": 171, "time_per_iteration": 2.5218472480773926 }, { "auxiliary_loss_clip": 0.01321781, "auxiliary_loss_mlp": 0.01102424, "balance_loss_clip": 1.04322505, "balance_loss_mlp": 1.08822954, "epoch": 0.010341199458890726, "flos": 21981176087040.0, "grad_norm": 2.44019530662096, "language_loss": 0.88130593, "learning_rate": 3.998968721976658e-06, "loss": 0.90554798, "num_input_tokens_seen": 3639610, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 2.328125, "step": 172, "time_per_iteration": 2.5287842750549316 }, { "auxiliary_loss_clip": 0.01312446, "auxiliary_loss_mlp": 0.01096913, "balance_loss_clip": 1.03950167, "balance_loss_mlp": 1.0837512, "epoch": 0.010401322711558695, "flos": 30809209472640.0, "grad_norm": 1.803783117703605, "language_loss": 0.80085742, "learning_rate": 3.998956554667865e-06, "loss": 0.82495105, "num_input_tokens_seen": 3664030, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 2.28125, "step": 173, "time_per_iteration": 2.5845518112182617 }, { "auxiliary_loss_clip": 0.01322341, "auxiliary_loss_mlp": 0.01106042, "balance_loss_clip": 1.04863095, "balance_loss_mlp": 1.08603311, "epoch": 0.010461445964226665, "flos": 24714037670400.0, "grad_norm": 1.8406425535950643, "language_loss": 0.82000279, "learning_rate": 3.998944316021455e-06, "loss": 0.84428656, "num_input_tokens_seen": 3683615, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 2.375, "step": 174, "time_per_iteration": 5.432522535324097 }, { "auxiliary_loss_clip": 0.01321477, "auxiliary_loss_mlp": 0.01108623, "balance_loss_clip": 1.04925632, "balance_loss_mlp": 1.08508635, "epoch": 0.010521569216894634, "flos": 27709096631040.0, "grad_norm": 3.0811273337718794, "language_loss": 0.72226876, "learning_rate": 3.9989320060378634e-06, "loss": 0.74656975, "num_input_tokens_seen": 3704540, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 2.375, "step": 175, "time_per_iteration": 3.943849563598633 }, { "auxiliary_loss_clip": 0.0132293, "auxiliary_loss_mlp": 0.01111667, "balance_loss_clip": 1.0503459, "balance_loss_mlp": 1.08794677, "epoch": 0.010581692469562603, "flos": 12457533306240.0, "grad_norm": 2.5844798526471795, "language_loss": 0.96816218, "learning_rate": 3.998919624717531e-06, "loss": 0.99250817, "num_input_tokens_seen": 3721320, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 2.359375, "step": 176, "time_per_iteration": 2.4978418350219727 }, { "auxiliary_loss_clip": 0.013113, "auxiliary_loss_mlp": 0.01106073, "balance_loss_clip": 1.05104566, "balance_loss_mlp": 1.0845108, "epoch": 0.010641815722230573, "flos": 19426581239040.0, "grad_norm": 2.4681889646441357, "language_loss": 0.76004493, "learning_rate": 3.998907172060898e-06, "loss": 0.78421861, "num_input_tokens_seen": 3739385, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.265625, "step": 177, "time_per_iteration": 2.4572675228118896 }, { "auxiliary_loss_clip": 0.01319403, "auxiliary_loss_mlp": 0.0109398, "balance_loss_clip": 1.03518641, "balance_loss_mlp": 1.08404136, "epoch": 0.010701938974898541, "flos": 18331600268160.0, "grad_norm": 2.4012411743357758, "language_loss": 0.75504708, "learning_rate": 3.9988946480684115e-06, "loss": 0.77918088, "num_input_tokens_seen": 3756360, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.359375, "step": 178, "time_per_iteration": 2.4346606731414795 }, { "auxiliary_loss_clip": 0.01326096, "auxiliary_loss_mlp": 0.01092432, "balance_loss_clip": 1.0330658, "balance_loss_mlp": 1.08672309, "epoch": 0.010762062227566512, "flos": 19203102426240.0, "grad_norm": 2.2127760174419295, "language_loss": 0.8330009, "learning_rate": 3.998882052740516e-06, "loss": 0.85718614, "num_input_tokens_seen": 3773930, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 2.40625, "step": 179, "time_per_iteration": 2.4346044063568115 }, { "auxiliary_loss_clip": 0.01315178, "auxiliary_loss_mlp": 0.01085268, "balance_loss_clip": 1.0242331, "balance_loss_mlp": 1.07958055, "epoch": 0.01082218548023448, "flos": 31424239716480.0, "grad_norm": 1.9845702506607297, "language_loss": 0.83313191, "learning_rate": 3.9988693860776616e-06, "loss": 0.85713637, "num_input_tokens_seen": 3793630, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 2.359375, "step": 180, "time_per_iteration": 2.625657558441162 }, { "auxiliary_loss_clip": 0.01318714, "auxiliary_loss_mlp": 0.01090594, "balance_loss_clip": 1.03275371, "balance_loss_mlp": 1.08474123, "epoch": 0.01088230873290245, "flos": 25045258538880.0, "grad_norm": 2.7088351245404394, "language_loss": 0.77022505, "learning_rate": 3.998856648080301e-06, "loss": 0.79431814, "num_input_tokens_seen": 3813610, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 2.34375, "step": 181, "time_per_iteration": 2.4812734127044678 }, { "auxiliary_loss_clip": 0.01312901, "auxiliary_loss_mlp": 0.01094778, "balance_loss_clip": 1.03638959, "balance_loss_mlp": 1.0799551, "epoch": 0.01094243198557042, "flos": 22892304504960.0, "grad_norm": 2.8413661282454994, "language_loss": 0.75974607, "learning_rate": 3.998843838748888e-06, "loss": 0.78382289, "num_input_tokens_seen": 3831390, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.328125, "step": 182, "time_per_iteration": 2.48538875579834 }, { "auxiliary_loss_clip": 0.01313813, "auxiliary_loss_mlp": 0.01107224, "balance_loss_clip": 1.04876375, "balance_loss_mlp": 1.08262396, "epoch": 0.011002555238238388, "flos": 17164104670080.0, "grad_norm": 2.0731930225158735, "language_loss": 0.86371708, "learning_rate": 3.9988309580838796e-06, "loss": 0.88792747, "num_input_tokens_seen": 3849705, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 2.3125, "step": 183, "time_per_iteration": 2.426077365875244 }, { "auxiliary_loss_clip": 0.01314354, "auxiliary_loss_mlp": 0.01103889, "balance_loss_clip": 1.04771781, "balance_loss_mlp": 1.08373165, "epoch": 0.011062678490906358, "flos": 22309045464960.0, "grad_norm": 2.040898168728987, "language_loss": 0.85582656, "learning_rate": 3.998818006085736e-06, "loss": 0.88000894, "num_input_tokens_seen": 3869230, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.3125, "step": 184, "time_per_iteration": 2.4764184951782227 }, { "auxiliary_loss_clip": 0.01309698, "auxiliary_loss_mlp": 0.0109481, "balance_loss_clip": 1.0387814, "balance_loss_mlp": 1.0818429, "epoch": 0.011122801743574327, "flos": 24387250544640.0, "grad_norm": 1.9350126346676748, "language_loss": 0.8281703, "learning_rate": 3.99880498275492e-06, "loss": 0.85221541, "num_input_tokens_seen": 3889735, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 2.28125, "step": 185, "time_per_iteration": 2.5032644271850586 }, { "auxiliary_loss_clip": 0.01317324, "auxiliary_loss_mlp": 0.01090762, "balance_loss_clip": 1.03339851, "balance_loss_mlp": 1.08348095, "epoch": 0.011182924996242297, "flos": 18149283815040.0, "grad_norm": 2.111643803506232, "language_loss": 0.70618719, "learning_rate": 3.9987918880918946e-06, "loss": 0.73026806, "num_input_tokens_seen": 3908855, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 2.34375, "step": 186, "time_per_iteration": 2.4431312084198 }, { "auxiliary_loss_clip": 0.01311425, "auxiliary_loss_mlp": 0.01093057, "balance_loss_clip": 1.03919864, "balance_loss_mlp": 1.07877183, "epoch": 0.011243048248910266, "flos": 15485899570560.0, "grad_norm": 2.569855957246881, "language_loss": 1.01050854, "learning_rate": 3.9987787220971295e-06, "loss": 1.03455341, "num_input_tokens_seen": 3923865, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.328125, "step": 187, "time_per_iteration": 2.4374749660491943 }, { "auxiliary_loss_clip": 0.01303492, "auxiliary_loss_mlp": 0.01099116, "balance_loss_clip": 1.04146647, "balance_loss_mlp": 1.0788306, "epoch": 0.011303171501578235, "flos": 40915273420800.0, "grad_norm": 2.548479597549839, "language_loss": 0.7428776, "learning_rate": 3.9987654847710925e-06, "loss": 0.76690364, "num_input_tokens_seen": 3946870, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 2.25, "step": 188, "time_per_iteration": 2.6361711025238037 }, { "auxiliary_loss_clip": 0.01240898, "auxiliary_loss_mlp": 0.01070704, "balance_loss_clip": 1.04972339, "balance_loss_mlp": 1.1219821, "epoch": 0.011363294754246205, "flos": 66299607884160.0, "grad_norm": 0.7354062841985437, "language_loss": 0.56136906, "learning_rate": 3.998752176114257e-06, "loss": 0.58448505, "num_input_tokens_seen": 4010005, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1875, "step": 189, "time_per_iteration": 3.160338878631592 }, { "auxiliary_loss_clip": 0.01305183, "auxiliary_loss_mlp": 0.01101284, "balance_loss_clip": 1.04456365, "balance_loss_mlp": 1.0788312, "epoch": 0.011423418006914174, "flos": 24899112120960.0, "grad_norm": 2.1583810659975446, "language_loss": 0.93964595, "learning_rate": 3.998738796127097e-06, "loss": 0.96371061, "num_input_tokens_seen": 4029035, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 2.265625, "step": 190, "time_per_iteration": 2.4950332641601562 }, { "auxiliary_loss_clip": 0.0130113, "auxiliary_loss_mlp": 0.0108689, "balance_loss_clip": 1.03446198, "balance_loss_mlp": 1.07827342, "epoch": 0.011483541259582144, "flos": 19790865031680.0, "grad_norm": 2.8798929290255653, "language_loss": 0.84118086, "learning_rate": 3.998725344810092e-06, "loss": 0.8650611, "num_input_tokens_seen": 4046995, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.234375, "step": 191, "time_per_iteration": 2.474900484085083 }, { "auxiliary_loss_clip": 0.01305111, "auxiliary_loss_mlp": 0.01097284, "balance_loss_clip": 1.04456949, "balance_loss_mlp": 1.0755477, "epoch": 0.011543664512250112, "flos": 26175746229120.0, "grad_norm": 1.8549438389464499, "language_loss": 0.91263413, "learning_rate": 3.99871182216372e-06, "loss": 0.93665814, "num_input_tokens_seen": 4065865, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.296875, "step": 192, "time_per_iteration": 2.534113645553589 }, { "auxiliary_loss_clip": 0.01302597, "auxiliary_loss_mlp": 0.0110381, "balance_loss_clip": 1.04594564, "balance_loss_mlp": 1.07853484, "epoch": 0.011603787764918083, "flos": 23767856380800.0, "grad_norm": 2.087142401840174, "language_loss": 0.86185181, "learning_rate": 3.998698228188465e-06, "loss": 0.88591588, "num_input_tokens_seen": 4085305, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 2.25, "step": 193, "time_per_iteration": 2.4675991535186768 }, { "auxiliary_loss_clip": 0.01300011, "auxiliary_loss_mlp": 0.01100296, "balance_loss_clip": 1.04460144, "balance_loss_mlp": 1.07354963, "epoch": 0.011663911017586051, "flos": 25953594048000.0, "grad_norm": 2.5113253385065124, "language_loss": 0.91893256, "learning_rate": 3.9986845628848115e-06, "loss": 0.94293571, "num_input_tokens_seen": 4105185, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.265625, "step": 194, "time_per_iteration": 2.4987685680389404 }, { "auxiliary_loss_clip": 0.01306807, "auxiliary_loss_mlp": 0.01093733, "balance_loss_clip": 1.0397315, "balance_loss_mlp": 1.07947361, "epoch": 0.01172403427025402, "flos": 17894173443840.0, "grad_norm": 2.357181387982265, "language_loss": 0.88895011, "learning_rate": 3.998670826253246e-06, "loss": 0.91295552, "num_input_tokens_seen": 4123160, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.28125, "step": 195, "time_per_iteration": 2.435065507888794 }, { "auxiliary_loss_clip": 0.01300711, "auxiliary_loss_mlp": 0.0109839, "balance_loss_clip": 1.04267144, "balance_loss_mlp": 1.07752693, "epoch": 0.01178415752292199, "flos": 17893579950720.0, "grad_norm": 2.2137038281252814, "language_loss": 0.84706283, "learning_rate": 3.998657018294261e-06, "loss": 0.87105381, "num_input_tokens_seen": 4140425, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 2.234375, "step": 196, "time_per_iteration": 2.449734926223755 }, { "auxiliary_loss_clip": 0.01303728, "auxiliary_loss_mlp": 0.01095433, "balance_loss_clip": 1.03992963, "balance_loss_mlp": 1.07673109, "epoch": 0.011844280775589959, "flos": 22892444150400.0, "grad_norm": 2.21390164074501, "language_loss": 0.9224143, "learning_rate": 3.998643139008348e-06, "loss": 0.94640595, "num_input_tokens_seen": 4159555, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.265625, "step": 197, "time_per_iteration": 2.449037551879883 }, { "auxiliary_loss_clip": 0.0129758, "auxiliary_loss_mlp": 0.01095406, "balance_loss_clip": 1.04073656, "balance_loss_mlp": 1.07462072, "epoch": 0.01190440402825793, "flos": 26979097680000.0, "grad_norm": 1.9318876007715875, "language_loss": 0.78542089, "learning_rate": 3.998629188396002e-06, "loss": 0.80935079, "num_input_tokens_seen": 4180480, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.21875, "step": 198, "time_per_iteration": 2.5457894802093506 }, { "auxiliary_loss_clip": 0.01295783, "auxiliary_loss_mlp": 0.01092376, "balance_loss_clip": 1.03916073, "balance_loss_mlp": 1.07411599, "epoch": 0.011964527280925898, "flos": 20520549780480.0, "grad_norm": 1.9546481104925915, "language_loss": 0.87513494, "learning_rate": 3.9986151664577225e-06, "loss": 0.8990165, "num_input_tokens_seen": 4198835, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.21875, "step": 199, "time_per_iteration": 2.43874454498291 }, { "auxiliary_loss_clip": 0.01303553, "auxiliary_loss_mlp": 0.01105744, "balance_loss_clip": 1.04752231, "balance_loss_mlp": 1.07472932, "epoch": 0.012024650533593867, "flos": 27744742995840.0, "grad_norm": 1.9422627423945882, "language_loss": 0.8069098, "learning_rate": 3.998601073194007e-06, "loss": 0.83100271, "num_input_tokens_seen": 4219335, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 2.28125, "step": 200, "time_per_iteration": 2.496249198913574 }, { "auxiliary_loss_clip": 0.01295899, "auxiliary_loss_mlp": 0.01089628, "balance_loss_clip": 1.033409, "balance_loss_mlp": 1.07088518, "epoch": 0.012084773786261837, "flos": 10451249360640.0, "grad_norm": 2.2723968814470914, "language_loss": 0.86802953, "learning_rate": 3.998586908605362e-06, "loss": 0.89188486, "num_input_tokens_seen": 4236940, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.25, "step": 201, "time_per_iteration": 2.4100606441497803 }, { "auxiliary_loss_clip": 0.01300387, "auxiliary_loss_mlp": 0.01099245, "balance_loss_clip": 1.04326439, "balance_loss_mlp": 1.07630789, "epoch": 0.012144897038929806, "flos": 23104821150720.0, "grad_norm": 1.7247674530220773, "language_loss": 0.83746284, "learning_rate": 3.99857267269229e-06, "loss": 0.86145914, "num_input_tokens_seen": 4256755, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 2.25, "step": 202, "time_per_iteration": 2.495171546936035 }, { "auxiliary_loss_clip": 0.01290128, "auxiliary_loss_mlp": 0.01090998, "balance_loss_clip": 1.03797424, "balance_loss_mlp": 1.06840992, "epoch": 0.012205020291597776, "flos": 21032132065920.0, "grad_norm": 1.8188762982178481, "language_loss": 0.89072442, "learning_rate": 3.9985583654553e-06, "loss": 0.9145357, "num_input_tokens_seen": 4276505, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.21875, "step": 203, "time_per_iteration": 2.4408295154571533 }, { "auxiliary_loss_clip": 0.01245051, "auxiliary_loss_mlp": 0.0107076, "balance_loss_clip": 1.04958832, "balance_loss_mlp": 1.12554657, "epoch": 0.012265143544265745, "flos": 68444846507520.0, "grad_norm": 0.9907239168515048, "language_loss": 0.61084068, "learning_rate": 3.998543986894904e-06, "loss": 0.63399887, "num_input_tokens_seen": 4330965, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1953125, "step": 204, "time_per_iteration": 3.0198233127593994 }, { "auxiliary_loss_clip": 0.01300788, "auxiliary_loss_mlp": 0.01098754, "balance_loss_clip": 1.04201007, "balance_loss_mlp": 1.07181823, "epoch": 0.012325266796933715, "flos": 17018307365760.0, "grad_norm": 2.2994236391340808, "language_loss": 0.90849137, "learning_rate": 3.9985295370116135e-06, "loss": 0.93248677, "num_input_tokens_seen": 4348200, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 2.28125, "step": 205, "time_per_iteration": 2.4178733825683594 }, { "auxiliary_loss_clip": 0.0130442, "auxiliary_loss_mlp": 0.01111671, "balance_loss_clip": 1.0553323, "balance_loss_mlp": 1.07182193, "epoch": 0.012385390049601683, "flos": 20189119443840.0, "grad_norm": 2.3596882713470264, "language_loss": 0.88420367, "learning_rate": 3.998515015805945e-06, "loss": 0.90836465, "num_input_tokens_seen": 4365460, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.3125, "step": 206, "time_per_iteration": 2.469921588897705 }, { "auxiliary_loss_clip": 0.01295143, "auxiliary_loss_mlp": 0.01092373, "balance_loss_clip": 1.03915834, "balance_loss_mlp": 1.07029605, "epoch": 0.012445513302269652, "flos": 16252208202240.0, "grad_norm": 2.085862417309902, "language_loss": 0.94610721, "learning_rate": 3.998500423278416e-06, "loss": 0.96998239, "num_input_tokens_seen": 4383650, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.25, "step": 207, "time_per_iteration": 2.4121739864349365 }, { "auxiliary_loss_clip": 0.01296356, "auxiliary_loss_mlp": 0.01095686, "balance_loss_clip": 1.04142165, "balance_loss_mlp": 1.07297158, "epoch": 0.012505636554937622, "flos": 23768240405760.0, "grad_norm": 2.3786309079219756, "language_loss": 0.74965876, "learning_rate": 3.998485759429547e-06, "loss": 0.77357912, "num_input_tokens_seen": 4403765, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 2.234375, "step": 208, "time_per_iteration": 2.555402994155884 }, { "auxiliary_loss_clip": 0.01286573, "auxiliary_loss_mlp": 0.01087111, "balance_loss_clip": 1.0330857, "balance_loss_mlp": 1.06945479, "epoch": 0.012565759807605591, "flos": 30590234225280.0, "grad_norm": 9.246201609274017, "language_loss": 0.98260844, "learning_rate": 3.998471024259863e-06, "loss": 1.00634527, "num_input_tokens_seen": 4421935, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.171875, "step": 209, "time_per_iteration": 2.5096023082733154 }, { "auxiliary_loss_clip": 0.01297111, "auxiliary_loss_mlp": 0.01103891, "balance_loss_clip": 1.04996026, "balance_loss_mlp": 1.07318473, "epoch": 0.012625883060273561, "flos": 40111956881280.0, "grad_norm": 2.8645082772694743, "language_loss": 0.84888291, "learning_rate": 3.998456217769888e-06, "loss": 0.87289298, "num_input_tokens_seen": 4441470, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.234375, "step": 210, "time_per_iteration": 2.654449939727783 }, { "auxiliary_loss_clip": 0.01288543, "auxiliary_loss_mlp": 0.01107358, "balance_loss_clip": 1.05447721, "balance_loss_mlp": 1.06919765, "epoch": 0.01268600631294153, "flos": 27087956899200.0, "grad_norm": 2.373016707420503, "language_loss": 0.96118057, "learning_rate": 3.998441339960152e-06, "loss": 0.98513967, "num_input_tokens_seen": 4459950, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.1875, "step": 211, "time_per_iteration": 2.4860005378723145 }, { "auxiliary_loss_clip": 0.01300727, "auxiliary_loss_mlp": 0.01111718, "balance_loss_clip": 1.0554986, "balance_loss_mlp": 1.07486534, "epoch": 0.012746129565609499, "flos": 16981823128320.0, "grad_norm": 2.2582161522130466, "language_loss": 0.94642508, "learning_rate": 3.998426390831185e-06, "loss": 0.97054946, "num_input_tokens_seen": 4478390, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.25, "step": 212, "time_per_iteration": 2.4837257862091064 }, { "auxiliary_loss_clip": 0.01289522, "auxiliary_loss_mlp": 0.01092615, "balance_loss_clip": 1.04164124, "balance_loss_mlp": 1.0723896, "epoch": 0.012806252818277469, "flos": 46531786216320.0, "grad_norm": 1.6722825749467651, "language_loss": 0.75558621, "learning_rate": 3.998411370383521e-06, "loss": 0.7794075, "num_input_tokens_seen": 4501665, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.171875, "step": 213, "time_per_iteration": 5.504590272903442 }, { "auxiliary_loss_clip": 0.01289584, "auxiliary_loss_mlp": 0.01098085, "balance_loss_clip": 1.0462532, "balance_loss_mlp": 1.06899095, "epoch": 0.012866376070945438, "flos": 14387846400000.0, "grad_norm": 6.023317248147852, "language_loss": 0.85730284, "learning_rate": 3.9983962786176945e-06, "loss": 0.88117963, "num_input_tokens_seen": 4519055, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.203125, "step": 214, "time_per_iteration": 3.83119797706604 }, { "auxiliary_loss_clip": 0.01287974, "auxiliary_loss_mlp": 0.01113797, "balance_loss_clip": 1.05791199, "balance_loss_mlp": 1.07126224, "epoch": 0.012926499323613408, "flos": 26139611105280.0, "grad_norm": 1.9864199504565263, "language_loss": 0.76788223, "learning_rate": 3.9983811155342465e-06, "loss": 0.79189986, "num_input_tokens_seen": 4540870, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 2.15625, "step": 215, "time_per_iteration": 3.860884666442871 }, { "auxiliary_loss_clip": 0.01301141, "auxiliary_loss_mlp": 0.01100734, "balance_loss_clip": 1.04892564, "balance_loss_mlp": 1.07581246, "epoch": 0.012986622576281377, "flos": 30115904227200.0, "grad_norm": 2.076965771857895, "language_loss": 0.89427274, "learning_rate": 3.998365881133717e-06, "loss": 0.91829151, "num_input_tokens_seen": 4560395, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.25, "step": 216, "time_per_iteration": 2.5689918994903564 }, { "auxiliary_loss_clip": 0.01289735, "auxiliary_loss_mlp": 0.01099849, "balance_loss_clip": 1.04475069, "balance_loss_mlp": 1.06854296, "epoch": 0.013046745828949347, "flos": 13953177573120.0, "grad_norm": 2.822835372352441, "language_loss": 0.93123031, "learning_rate": 3.998350575416648e-06, "loss": 0.95512605, "num_input_tokens_seen": 4575785, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.21875, "step": 217, "time_per_iteration": 2.40677809715271 }, { "auxiliary_loss_clip": 0.0128734, "auxiliary_loss_mlp": 0.01093811, "balance_loss_clip": 1.03728223, "balance_loss_mlp": 1.06867433, "epoch": 0.013106869081617315, "flos": 17346874970880.0, "grad_norm": 1.96990105044481, "language_loss": 0.92702591, "learning_rate": 3.9983351983835885e-06, "loss": 0.95083737, "num_input_tokens_seen": 4594985, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 2.1875, "step": 218, "time_per_iteration": 2.459202766418457 }, { "auxiliary_loss_clip": 0.01281061, "auxiliary_loss_mlp": 0.01092205, "balance_loss_clip": 1.0364871, "balance_loss_mlp": 1.06463194, "epoch": 0.013166992334285284, "flos": 25883732684160.0, "grad_norm": 2.0566708002092895, "language_loss": 0.85948598, "learning_rate": 3.998319750035087e-06, "loss": 0.88321859, "num_input_tokens_seen": 4616125, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 2.15625, "step": 219, "time_per_iteration": 2.4813380241394043 }, { "auxiliary_loss_clip": 0.01284623, "auxiliary_loss_mlp": 0.01085259, "balance_loss_clip": 1.03380799, "balance_loss_mlp": 1.06691563, "epoch": 0.013227115586953254, "flos": 31174610428800.0, "grad_norm": 1.7846173857072796, "language_loss": 0.87097883, "learning_rate": 3.998304230371692e-06, "loss": 0.8946777, "num_input_tokens_seen": 4637795, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.171875, "step": 220, "time_per_iteration": 2.534933090209961 }, { "auxiliary_loss_clip": 0.01278089, "auxiliary_loss_mlp": 0.01087831, "balance_loss_clip": 1.03773904, "balance_loss_mlp": 1.06271708, "epoch": 0.013287238839621223, "flos": 20408513627520.0, "grad_norm": 1.8386479521990724, "language_loss": 0.86070645, "learning_rate": 3.99828863939396e-06, "loss": 0.88436568, "num_input_tokens_seen": 4656835, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.15625, "step": 221, "time_per_iteration": 2.4281139373779297 }, { "auxiliary_loss_clip": 0.01285994, "auxiliary_loss_mlp": 0.01091569, "balance_loss_clip": 1.03704226, "balance_loss_mlp": 1.06290507, "epoch": 0.013347362092289193, "flos": 14136262076160.0, "grad_norm": 2.093337358758933, "language_loss": 0.91403848, "learning_rate": 3.998272977102448e-06, "loss": 0.93781406, "num_input_tokens_seen": 4673015, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.21875, "step": 222, "time_per_iteration": 2.4241268634796143 }, { "auxiliary_loss_clip": 0.01277546, "auxiliary_loss_mlp": 0.01089054, "balance_loss_clip": 1.03266788, "balance_loss_mlp": 1.06376529, "epoch": 0.013407485344957162, "flos": 21796660218240.0, "grad_norm": 2.2189139260236526, "language_loss": 0.94726562, "learning_rate": 3.998257243497712e-06, "loss": 0.97093159, "num_input_tokens_seen": 4692355, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.140625, "step": 223, "time_per_iteration": 2.4427013397216797 }, { "auxiliary_loss_clip": 0.01275896, "auxiliary_loss_mlp": 0.01088529, "balance_loss_clip": 1.03576696, "balance_loss_mlp": 1.06117606, "epoch": 0.013467608597625132, "flos": 18620716170240.0, "grad_norm": 2.8381194576812163, "language_loss": 0.87496227, "learning_rate": 3.998241438580316e-06, "loss": 0.89860654, "num_input_tokens_seen": 4710080, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.15625, "step": 224, "time_per_iteration": 2.4462950229644775 }, { "auxiliary_loss_clip": 0.01276996, "auxiliary_loss_mlp": 0.01084642, "balance_loss_clip": 1.02935266, "balance_loss_mlp": 1.06108713, "epoch": 0.013527731850293101, "flos": 18551308654080.0, "grad_norm": 2.1273599177350144, "language_loss": 0.88692373, "learning_rate": 3.998225562350823e-06, "loss": 0.9105401, "num_input_tokens_seen": 4728980, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.15625, "step": 225, "time_per_iteration": 2.4228098392486572 }, { "auxiliary_loss_clip": 0.01272484, "auxiliary_loss_mlp": 0.01101737, "balance_loss_clip": 1.04432523, "balance_loss_mlp": 1.06158555, "epoch": 0.01358785510296107, "flos": 19164558418560.0, "grad_norm": 1.815503315808218, "language_loss": 0.98583525, "learning_rate": 3.998209614809799e-06, "loss": 1.00957751, "num_input_tokens_seen": 4747020, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 2.109375, "step": 226, "time_per_iteration": 2.455432176589966 }, { "auxiliary_loss_clip": 0.01278348, "auxiliary_loss_mlp": 0.01087459, "balance_loss_clip": 1.03457808, "balance_loss_mlp": 1.06498325, "epoch": 0.01364797835562904, "flos": 23328858545280.0, "grad_norm": 2.7114359242763126, "language_loss": 0.90125763, "learning_rate": 3.9981935959578145e-06, "loss": 0.92491573, "num_input_tokens_seen": 4765000, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.125, "step": 227, "time_per_iteration": 2.450788736343384 }, { "auxiliary_loss_clip": 0.01199205, "auxiliary_loss_mlp": 0.01042575, "balance_loss_clip": 1.0257901, "balance_loss_mlp": 1.0826751, "epoch": 0.013708101608297009, "flos": 70989943599360.0, "grad_norm": 0.9286409859088512, "language_loss": 0.57483828, "learning_rate": 3.99817750579544e-06, "loss": 0.59725606, "num_input_tokens_seen": 4833210, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 1.171875, "step": 228, "time_per_iteration": 3.1797547340393066 }, { "auxiliary_loss_clip": 0.01271341, "auxiliary_loss_mlp": 0.01092144, "balance_loss_clip": 1.04128909, "balance_loss_mlp": 1.06173563, "epoch": 0.013768224860964979, "flos": 16324268981760.0, "grad_norm": 2.2875216044741458, "language_loss": 0.86467117, "learning_rate": 3.998161344323251e-06, "loss": 0.88830602, "num_input_tokens_seen": 4850120, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.09375, "step": 229, "time_per_iteration": 2.410629987716675 }, { "auxiliary_loss_clip": 0.01274439, "auxiliary_loss_mlp": 0.01091076, "balance_loss_clip": 1.03631115, "balance_loss_mlp": 1.05819178, "epoch": 0.013828348113632948, "flos": 20192017086720.0, "grad_norm": 4.939246484573645, "language_loss": 0.83541977, "learning_rate": 3.998145111541823e-06, "loss": 0.85907495, "num_input_tokens_seen": 4866215, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.15625, "step": 230, "time_per_iteration": 2.4956488609313965 }, { "auxiliary_loss_clip": 0.01271041, "auxiliary_loss_mlp": 0.01091215, "balance_loss_clip": 1.04019332, "balance_loss_mlp": 1.05893278, "epoch": 0.013888471366300916, "flos": 20740013786880.0, "grad_norm": 1.8456005596458809, "language_loss": 0.89727223, "learning_rate": 3.998128807451736e-06, "loss": 0.9208948, "num_input_tokens_seen": 4885630, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.125, "step": 231, "time_per_iteration": 2.4690020084381104 }, { "auxiliary_loss_clip": 0.01272094, "auxiliary_loss_mlp": 0.01096445, "balance_loss_clip": 1.0455898, "balance_loss_mlp": 1.05948091, "epoch": 0.013948594618968886, "flos": 22089546547200.0, "grad_norm": 2.7312656935955193, "language_loss": 0.83418334, "learning_rate": 3.9981124320535715e-06, "loss": 0.85786867, "num_input_tokens_seen": 4905570, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.125, "step": 232, "time_per_iteration": 2.4867002964019775 }, { "auxiliary_loss_clip": 0.01279185, "auxiliary_loss_mlp": 0.01088507, "balance_loss_clip": 1.03290725, "balance_loss_mlp": 1.05728006, "epoch": 0.014008717871636855, "flos": 19062087978240.0, "grad_norm": 3.2014549539474055, "language_loss": 0.73482341, "learning_rate": 3.998095985347915e-06, "loss": 0.75850034, "num_input_tokens_seen": 4923535, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.21875, "step": 233, "time_per_iteration": 2.4370038509368896 }, { "auxiliary_loss_clip": 0.01278501, "auxiliary_loss_mlp": 0.0110119, "balance_loss_clip": 1.04537606, "balance_loss_mlp": 1.06214762, "epoch": 0.014068841124304825, "flos": 14530152568320.0, "grad_norm": 2.3160215159406676, "language_loss": 0.84934628, "learning_rate": 3.998079467335351e-06, "loss": 0.87314326, "num_input_tokens_seen": 4939200, "router_z_loss_clip": 0.55859375, "router_z_loss_mlp": 2.15625, "step": 234, "time_per_iteration": 2.4662678241729736 }, { "auxiliary_loss_clip": 0.01272153, "auxiliary_loss_mlp": 0.01089142, "balance_loss_clip": 1.03957438, "balance_loss_mlp": 1.05987799, "epoch": 0.014128964376972794, "flos": 18076420074240.0, "grad_norm": 2.50023724061121, "language_loss": 0.88307524, "learning_rate": 3.998062878016471e-06, "loss": 0.90668821, "num_input_tokens_seen": 4956620, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 2.125, "step": 235, "time_per_iteration": 2.4285151958465576 }, { "auxiliary_loss_clip": 0.01270289, "auxiliary_loss_mlp": 0.01100538, "balance_loss_clip": 1.0480144, "balance_loss_mlp": 1.06112003, "epoch": 0.014189087629640764, "flos": 25333257277440.0, "grad_norm": 2.2535235264048796, "language_loss": 0.85064286, "learning_rate": 3.998046217391867e-06, "loss": 0.87435114, "num_input_tokens_seen": 4975650, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.09375, "step": 236, "time_per_iteration": 2.5147581100463867 }, { "auxiliary_loss_clip": 0.01272199, "auxiliary_loss_mlp": 0.01086088, "balance_loss_clip": 1.03139448, "balance_loss_mlp": 1.05876279, "epoch": 0.014249210882308733, "flos": 36138212288640.0, "grad_norm": 1.9080197335876328, "language_loss": 0.81960863, "learning_rate": 3.9980294854621325e-06, "loss": 0.8431915, "num_input_tokens_seen": 4997415, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.140625, "step": 237, "time_per_iteration": 2.575861692428589 }, { "auxiliary_loss_clip": 0.01267233, "auxiliary_loss_mlp": 0.0110332, "balance_loss_clip": 1.04819798, "balance_loss_mlp": 1.05855203, "epoch": 0.014309334134976702, "flos": 12932142595200.0, "grad_norm": 2.3071106764312312, "language_loss": 0.76282841, "learning_rate": 3.998012682227866e-06, "loss": 0.78653395, "num_input_tokens_seen": 5013905, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.09375, "step": 238, "time_per_iteration": 2.439086675643921 }, { "auxiliary_loss_clip": 0.0126541, "auxiliary_loss_mlp": 0.01091746, "balance_loss_clip": 1.04174924, "balance_loss_mlp": 1.05934358, "epoch": 0.014369457387644672, "flos": 20776463112960.0, "grad_norm": 2.189294612300346, "language_loss": 0.86273628, "learning_rate": 3.9979958076896655e-06, "loss": 0.88630784, "num_input_tokens_seen": 5033645, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.0625, "step": 239, "time_per_iteration": 2.4491758346557617 }, { "auxiliary_loss_clip": 0.01255269, "auxiliary_loss_mlp": 0.0108271, "balance_loss_clip": 1.03245175, "balance_loss_mlp": 1.05461502, "epoch": 0.01442958064031264, "flos": 25847353180800.0, "grad_norm": 2.0680005127153183, "language_loss": 0.92302793, "learning_rate": 3.997978861848135e-06, "loss": 0.94640774, "num_input_tokens_seen": 5052875, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.015625, "step": 240, "time_per_iteration": 2.511864185333252 }, { "auxiliary_loss_clip": 0.01260894, "auxiliary_loss_mlp": 0.01088302, "balance_loss_clip": 1.03828204, "balance_loss_mlp": 1.05558801, "epoch": 0.014489703892980611, "flos": 28218479500800.0, "grad_norm": 2.0359378222038345, "language_loss": 0.84616089, "learning_rate": 3.997961844703877e-06, "loss": 0.86965281, "num_input_tokens_seen": 5075005, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.046875, "step": 241, "time_per_iteration": 2.5065231323242188 }, { "auxiliary_loss_clip": 0.01259675, "auxiliary_loss_mlp": 0.01095578, "balance_loss_clip": 1.04260087, "balance_loss_mlp": 1.06072092, "epoch": 0.01454982714564858, "flos": 22489860729600.0, "grad_norm": 2.1281324971549664, "language_loss": 0.87685394, "learning_rate": 3.997944756257501e-06, "loss": 0.90040648, "num_input_tokens_seen": 5091875, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 1.984375, "step": 242, "time_per_iteration": 2.4625511169433594 }, { "auxiliary_loss_clip": 0.01262409, "auxiliary_loss_mlp": 0.01079613, "balance_loss_clip": 1.02866244, "balance_loss_mlp": 1.05539179, "epoch": 0.014609950398316548, "flos": 21652119722880.0, "grad_norm": 2.0902873867775877, "language_loss": 0.85707223, "learning_rate": 3.997927596509616e-06, "loss": 0.88049245, "num_input_tokens_seen": 5111290, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.0625, "step": 243, "time_per_iteration": 2.4473350048065186 }, { "auxiliary_loss_clip": 0.01269157, "auxiliary_loss_mlp": 0.0109046, "balance_loss_clip": 1.03746009, "balance_loss_mlp": 1.05946577, "epoch": 0.014670073650984519, "flos": 21868965377280.0, "grad_norm": 1.566170571801324, "language_loss": 0.83990335, "learning_rate": 3.997910365460834e-06, "loss": 0.86349952, "num_input_tokens_seen": 5132265, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.09375, "step": 244, "time_per_iteration": 2.5066041946411133 }, { "auxiliary_loss_clip": 0.01268433, "auxiliary_loss_mlp": 0.01101666, "balance_loss_clip": 1.04601955, "balance_loss_mlp": 1.05904102, "epoch": 0.014730196903652487, "flos": 23182642304640.0, "grad_norm": 2.681066411928938, "language_loss": 0.78249276, "learning_rate": 3.9978930631117705e-06, "loss": 0.80619383, "num_input_tokens_seen": 5148575, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.09375, "step": 245, "time_per_iteration": 2.4403247833251953 }, { "auxiliary_loss_clip": 0.01270861, "auxiliary_loss_mlp": 0.01090869, "balance_loss_clip": 1.03546047, "balance_loss_mlp": 1.05506361, "epoch": 0.014790320156320457, "flos": 23221465603200.0, "grad_norm": 1.9837610932174923, "language_loss": 0.83586812, "learning_rate": 3.997875689463043e-06, "loss": 0.85948539, "num_input_tokens_seen": 5170415, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.15625, "step": 246, "time_per_iteration": 2.513209581375122 }, { "auxiliary_loss_clip": 0.01264415, "auxiliary_loss_mlp": 0.01085882, "balance_loss_clip": 1.03235722, "balance_loss_mlp": 1.05463314, "epoch": 0.014850443408988426, "flos": 15814571909760.0, "grad_norm": 2.458078485465398, "language_loss": 0.89064759, "learning_rate": 3.9978582445152705e-06, "loss": 0.9141506, "num_input_tokens_seen": 5188565, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.09375, "step": 247, "time_per_iteration": 2.411815881729126 }, { "auxiliary_loss_clip": 0.01265272, "auxiliary_loss_mlp": 0.01078643, "balance_loss_clip": 1.02516592, "balance_loss_mlp": 1.05134773, "epoch": 0.014910566661656396, "flos": 22780617465600.0, "grad_norm": 2.05474741939736, "language_loss": 0.77996743, "learning_rate": 3.997840728269077e-06, "loss": 0.8034066, "num_input_tokens_seen": 5207810, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.140625, "step": 248, "time_per_iteration": 2.5023632049560547 }, { "auxiliary_loss_clip": 0.01266455, "auxiliary_loss_mlp": 0.01094696, "balance_loss_clip": 1.04407978, "balance_loss_mlp": 1.05812287, "epoch": 0.014970689914324365, "flos": 26863954416000.0, "grad_norm": 1.9115641066417266, "language_loss": 0.83001065, "learning_rate": 3.997823140725088e-06, "loss": 0.8536222, "num_input_tokens_seen": 5226210, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.09375, "step": 249, "time_per_iteration": 2.513101100921631 }, { "auxiliary_loss_clip": 0.01264516, "auxiliary_loss_mlp": 0.01087825, "balance_loss_clip": 1.03763819, "balance_loss_mlp": 1.05710781, "epoch": 0.015030813166992334, "flos": 13984948776960.0, "grad_norm": 3.3408272000276846, "language_loss": 0.92655754, "learning_rate": 3.997805481883929e-06, "loss": 0.95008093, "num_input_tokens_seen": 5241660, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.078125, "step": 250, "time_per_iteration": 2.465294361114502 }, { "auxiliary_loss_clip": 0.01271166, "auxiliary_loss_mlp": 0.01107342, "balance_loss_clip": 1.05267227, "balance_loss_mlp": 1.05898547, "epoch": 0.015090936419660304, "flos": 24716656022400.0, "grad_norm": 2.6396424242306686, "language_loss": 0.96257102, "learning_rate": 3.997787751746231e-06, "loss": 0.98635614, "num_input_tokens_seen": 5261090, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.125, "step": 251, "time_per_iteration": 2.4686830043792725 }, { "auxiliary_loss_clip": 0.01261888, "auxiliary_loss_mlp": 0.01093252, "balance_loss_clip": 1.04177761, "balance_loss_mlp": 1.05475163, "epoch": 0.015151059672328273, "flos": 25737621177600.0, "grad_norm": 2.3097383613973905, "language_loss": 0.83784211, "learning_rate": 3.997769950312628e-06, "loss": 0.86139357, "num_input_tokens_seen": 5279175, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.0625, "step": 252, "time_per_iteration": 3.93217134475708 }, { "auxiliary_loss_clip": 0.01259553, "auxiliary_loss_mlp": 0.01094354, "balance_loss_clip": 1.04118657, "balance_loss_mlp": 1.05521631, "epoch": 0.015211182924996243, "flos": 21870152363520.0, "grad_norm": 2.096397039732292, "language_loss": 0.97462344, "learning_rate": 3.997752077583753e-06, "loss": 0.99816239, "num_input_tokens_seen": 5296975, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.046875, "step": 253, "time_per_iteration": 5.36123251914978 }, { "auxiliary_loss_clip": 0.01183241, "auxiliary_loss_mlp": 0.01029227, "balance_loss_clip": 1.01310992, "balance_loss_mlp": 1.07796979, "epoch": 0.015271306177664212, "flos": 66891734409600.0, "grad_norm": 0.838537053241808, "language_loss": 0.55493897, "learning_rate": 3.997734133560246e-06, "loss": 0.57706368, "num_input_tokens_seen": 5358375, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 1.0546875, "step": 254, "time_per_iteration": 3.1195199489593506 }, { "auxiliary_loss_clip": 0.01263656, "auxiliary_loss_mlp": 0.01102612, "balance_loss_clip": 1.04877758, "balance_loss_mlp": 1.05328155, "epoch": 0.01533142943033218, "flos": 26832846528000.0, "grad_norm": 2.146919372189757, "language_loss": 0.89907759, "learning_rate": 3.997716118242746e-06, "loss": 0.92274028, "num_input_tokens_seen": 5377255, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.109375, "step": 255, "time_per_iteration": 3.897091865539551 }, { "auxiliary_loss_clip": 0.01261725, "auxiliary_loss_mlp": 0.01104855, "balance_loss_clip": 1.05206895, "balance_loss_mlp": 1.05353701, "epoch": 0.01539155268300015, "flos": 20812702970880.0, "grad_norm": 2.1854471015532435, "language_loss": 0.84855503, "learning_rate": 3.997698031631898e-06, "loss": 0.87222087, "num_input_tokens_seen": 5395320, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.078125, "step": 256, "time_per_iteration": 2.495079755783081 }, { "auxiliary_loss_clip": 0.01264272, "auxiliary_loss_mlp": 0.01096694, "balance_loss_clip": 1.04481387, "balance_loss_mlp": 1.05318999, "epoch": 0.01545167593566812, "flos": 15960927795840.0, "grad_norm": 3.135350269790941, "language_loss": 0.70954847, "learning_rate": 3.997679873728344e-06, "loss": 0.73315823, "num_input_tokens_seen": 5411970, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.109375, "step": 257, "time_per_iteration": 2.3912618160247803 }, { "auxiliary_loss_clip": 0.01261377, "auxiliary_loss_mlp": 0.01097662, "balance_loss_clip": 1.04547238, "balance_loss_mlp": 1.05512738, "epoch": 0.01551179918833609, "flos": 22600640073600.0, "grad_norm": 2.5275914771710566, "language_loss": 0.94030905, "learning_rate": 3.9976616445327355e-06, "loss": 0.96389937, "num_input_tokens_seen": 5430245, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.0625, "step": 258, "time_per_iteration": 2.523621082305908 }, { "auxiliary_loss_clip": 0.0125721, "auxiliary_loss_mlp": 0.01084906, "balance_loss_clip": 1.03304982, "balance_loss_mlp": 1.05085206, "epoch": 0.015571922441004058, "flos": 22815705248640.0, "grad_norm": 2.701701695211177, "language_loss": 0.92466164, "learning_rate": 3.9976433440457205e-06, "loss": 0.94808275, "num_input_tokens_seen": 5448905, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.0625, "step": 259, "time_per_iteration": 2.45497465133667 }, { "auxiliary_loss_clip": 0.01253468, "auxiliary_loss_mlp": 0.0108324, "balance_loss_clip": 1.03674841, "balance_loss_mlp": 1.05464232, "epoch": 0.015632045693672027, "flos": 18946595600640.0, "grad_norm": 1.7812673312303993, "language_loss": 0.96986514, "learning_rate": 3.997624972267954e-06, "loss": 0.99323225, "num_input_tokens_seen": 5466405, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.9921875, "step": 260, "time_per_iteration": 2.4638025760650635 }, { "auxiliary_loss_clip": 0.01264476, "auxiliary_loss_mlp": 0.01097613, "balance_loss_clip": 1.04480374, "balance_loss_mlp": 1.05541444, "epoch": 0.015692168946339995, "flos": 29970421125120.0, "grad_norm": 2.0705749401091733, "language_loss": 0.87201715, "learning_rate": 3.99760652920009e-06, "loss": 0.89563799, "num_input_tokens_seen": 5487055, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.09375, "step": 261, "time_per_iteration": 2.5186092853546143 }, { "auxiliary_loss_clip": 0.0126125, "auxiliary_loss_mlp": 0.01086343, "balance_loss_clip": 1.03577399, "balance_loss_mlp": 1.05366397, "epoch": 0.015752292199007967, "flos": 19391039608320.0, "grad_norm": 1.9911084105028154, "language_loss": 0.66606891, "learning_rate": 3.997588014842788e-06, "loss": 0.68954486, "num_input_tokens_seen": 5506600, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.078125, "step": 262, "time_per_iteration": 2.4510912895202637 }, { "auxiliary_loss_clip": 0.01257533, "auxiliary_loss_mlp": 0.01103056, "balance_loss_clip": 1.0508666, "balance_loss_mlp": 1.0543381, "epoch": 0.015812415451675936, "flos": 20338756997760.0, "grad_norm": 2.204412624175132, "language_loss": 0.6779955, "learning_rate": 3.997569429196708e-06, "loss": 0.70160139, "num_input_tokens_seen": 5524350, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.03125, "step": 263, "time_per_iteration": 2.4451043605804443 }, { "auxiliary_loss_clip": 0.01260264, "auxiliary_loss_mlp": 0.01090615, "balance_loss_clip": 1.03990364, "balance_loss_mlp": 1.05124879, "epoch": 0.015872538704343905, "flos": 17524583124480.0, "grad_norm": 2.9410747460535283, "language_loss": 0.84258455, "learning_rate": 3.997550772262513e-06, "loss": 0.86609334, "num_input_tokens_seen": 5542145, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.09375, "step": 264, "time_per_iteration": 2.4145796298980713 }, { "auxiliary_loss_clip": 0.01264681, "auxiliary_loss_mlp": 0.0108678, "balance_loss_clip": 1.03683114, "balance_loss_mlp": 1.05565, "epoch": 0.015932661957011873, "flos": 15259802405760.0, "grad_norm": 3.7930459922362205, "language_loss": 1.03443956, "learning_rate": 3.997532044040869e-06, "loss": 1.05795407, "num_input_tokens_seen": 5557920, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.078125, "step": 265, "time_per_iteration": 2.4558472633361816 }, { "auxiliary_loss_clip": 0.01265797, "auxiliary_loss_mlp": 0.01091431, "balance_loss_clip": 1.03757191, "balance_loss_mlp": 1.05655909, "epoch": 0.015992785209679845, "flos": 20301504710400.0, "grad_norm": 6.033841447363089, "language_loss": 0.74710017, "learning_rate": 3.997513244532445e-06, "loss": 0.77067244, "num_input_tokens_seen": 5576290, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.09375, "step": 266, "time_per_iteration": 2.552818775177002 }, { "auxiliary_loss_clip": 0.01253089, "auxiliary_loss_mlp": 0.01081126, "balance_loss_clip": 1.03139186, "balance_loss_mlp": 1.05281532, "epoch": 0.016052908462347814, "flos": 23361397799040.0, "grad_norm": 1.8207170903870495, "language_loss": 0.89983177, "learning_rate": 3.997494373737912e-06, "loss": 0.9231739, "num_input_tokens_seen": 5595205, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 2.0, "step": 267, "time_per_iteration": 2.4479634761810303 }, { "auxiliary_loss_clip": 0.01263384, "auxiliary_loss_mlp": 0.01093478, "balance_loss_clip": 1.04264736, "balance_loss_mlp": 1.05432463, "epoch": 0.016113031715015783, "flos": 21285566691840.0, "grad_norm": 2.3169364211275987, "language_loss": 0.8484515, "learning_rate": 3.997475431657943e-06, "loss": 0.87202013, "num_input_tokens_seen": 5612645, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.09375, "step": 268, "time_per_iteration": 2.439906120300293 }, { "auxiliary_loss_clip": 0.01254088, "auxiliary_loss_mlp": 0.01081637, "balance_loss_clip": 1.03114021, "balance_loss_mlp": 1.05426097, "epoch": 0.01617315496768375, "flos": 18913742144640.0, "grad_norm": 2.45556176865787, "language_loss": 0.88008893, "learning_rate": 3.9974564182932135e-06, "loss": 0.9034462, "num_input_tokens_seen": 5628345, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.0, "step": 269, "time_per_iteration": 2.409527063369751 }, { "auxiliary_loss_clip": 0.0126133, "auxiliary_loss_mlp": 0.01089492, "balance_loss_clip": 1.03787434, "balance_loss_mlp": 1.05400348, "epoch": 0.01623327822035172, "flos": 16545513467520.0, "grad_norm": 2.600771597384193, "language_loss": 0.96567738, "learning_rate": 3.997437333644403e-06, "loss": 0.98918557, "num_input_tokens_seen": 5645940, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.078125, "step": 270, "time_per_iteration": 2.4156806468963623 }, { "auxiliary_loss_clip": 0.01258176, "auxiliary_loss_mlp": 0.0109777, "balance_loss_clip": 1.04743981, "balance_loss_mlp": 1.05751896, "epoch": 0.016293401473019692, "flos": 23512361984640.0, "grad_norm": 2.3087201569184472, "language_loss": 0.85398507, "learning_rate": 3.9974181777121915e-06, "loss": 0.87754458, "num_input_tokens_seen": 5665690, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.0, "step": 271, "time_per_iteration": 2.4505057334899902 }, { "auxiliary_loss_clip": 0.01259898, "auxiliary_loss_mlp": 0.01091065, "balance_loss_clip": 1.03847027, "balance_loss_mlp": 1.05401468, "epoch": 0.01635352472568766, "flos": 29014988325120.0, "grad_norm": 8.633948262091137, "language_loss": 0.80753708, "learning_rate": 3.997398950497263e-06, "loss": 0.8310467, "num_input_tokens_seen": 5683190, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.0625, "step": 272, "time_per_iteration": 2.4930429458618164 }, { "auxiliary_loss_clip": 0.01254961, "auxiliary_loss_mlp": 0.01091194, "balance_loss_clip": 1.0410068, "balance_loss_mlp": 1.05233335, "epoch": 0.01641364797835563, "flos": 13369674153600.0, "grad_norm": 2.121148494629337, "language_loss": 0.80297101, "learning_rate": 3.9973796520003044e-06, "loss": 0.82643253, "num_input_tokens_seen": 5699780, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.03125, "step": 273, "time_per_iteration": 2.396026611328125 }, { "auxiliary_loss_clip": 0.01254567, "auxiliary_loss_mlp": 0.01091044, "balance_loss_clip": 1.03914022, "balance_loss_mlp": 1.05181062, "epoch": 0.016473771231023598, "flos": 18877292818560.0, "grad_norm": 2.2415439484369513, "language_loss": 0.90765822, "learning_rate": 3.997360282222004e-06, "loss": 0.93111444, "num_input_tokens_seen": 5716980, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.03125, "step": 274, "time_per_iteration": 2.4306771755218506 }, { "auxiliary_loss_clip": 0.01256585, "auxiliary_loss_mlp": 0.010984, "balance_loss_clip": 1.04537582, "balance_loss_mlp": 1.05353796, "epoch": 0.016533894483691566, "flos": 22600535339520.0, "grad_norm": 1.8461598322284212, "language_loss": 0.87523705, "learning_rate": 3.997340841163053e-06, "loss": 0.8987869, "num_input_tokens_seen": 5737780, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.03125, "step": 275, "time_per_iteration": 2.4361367225646973 }, { "auxiliary_loss_clip": 0.01259522, "auxiliary_loss_mlp": 0.01098736, "balance_loss_clip": 1.04533052, "balance_loss_mlp": 1.05604446, "epoch": 0.01659401773635954, "flos": 21506112950400.0, "grad_norm": 1.694260442445138, "language_loss": 0.80209416, "learning_rate": 3.9973213288241445e-06, "loss": 0.8256768, "num_input_tokens_seen": 5758330, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.03125, "step": 276, "time_per_iteration": 2.4542930126190186 }, { "auxiliary_loss_clip": 0.01250956, "auxiliary_loss_mlp": 0.01088707, "balance_loss_clip": 1.04045093, "balance_loss_mlp": 1.05300093, "epoch": 0.016654140989027507, "flos": 32849673505920.0, "grad_norm": 1.7758054657349884, "language_loss": 0.80436337, "learning_rate": 3.997301745205976e-06, "loss": 0.82775998, "num_input_tokens_seen": 5778340, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 1.984375, "step": 277, "time_per_iteration": 2.5271694660186768 }, { "auxiliary_loss_clip": 0.01251341, "auxiliary_loss_mlp": 0.01082503, "balance_loss_clip": 1.02964544, "balance_loss_mlp": 1.05135834, "epoch": 0.016714264241695476, "flos": 12305591602560.0, "grad_norm": 2.840228512453406, "language_loss": 0.79760599, "learning_rate": 3.997282090309246e-06, "loss": 0.82094443, "num_input_tokens_seen": 5794295, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.0, "step": 278, "time_per_iteration": 2.4087443351745605 }, { "auxiliary_loss_clip": 0.0125048, "auxiliary_loss_mlp": 0.01082849, "balance_loss_clip": 1.03437805, "balance_loss_mlp": 1.05186558, "epoch": 0.016774387494363444, "flos": 27122625745920.0, "grad_norm": 1.9624673467401972, "language_loss": 0.90430892, "learning_rate": 3.9972623641346555e-06, "loss": 0.92764223, "num_input_tokens_seen": 5814405, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 1.984375, "step": 279, "time_per_iteration": 2.4882657527923584 }, { "auxiliary_loss_clip": 0.01252485, "auxiliary_loss_mlp": 0.01087283, "balance_loss_clip": 1.03421128, "balance_loss_mlp": 1.05146337, "epoch": 0.016834510747031413, "flos": 20190515898240.0, "grad_norm": 3.3592851885107806, "language_loss": 0.93480706, "learning_rate": 3.9972425666829085e-06, "loss": 0.95820475, "num_input_tokens_seen": 5832795, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.0, "step": 280, "time_per_iteration": 2.450284481048584 }, { "auxiliary_loss_clip": 0.0125678, "auxiliary_loss_mlp": 0.01087997, "balance_loss_clip": 1.03652251, "balance_loss_mlp": 1.0515883, "epoch": 0.016894633999699385, "flos": 27272961527040.0, "grad_norm": 2.204190539641557, "language_loss": 0.73594493, "learning_rate": 3.997222697954712e-06, "loss": 0.75939268, "num_input_tokens_seen": 5855750, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.0625, "step": 281, "time_per_iteration": 2.5007758140563965 }, { "auxiliary_loss_clip": 0.01255022, "auxiliary_loss_mlp": 0.01095399, "balance_loss_clip": 1.04506898, "balance_loss_mlp": 1.05505633, "epoch": 0.016954757252367354, "flos": 14902081948800.0, "grad_norm": 2.637264410239938, "language_loss": 0.79733199, "learning_rate": 3.997202757950775e-06, "loss": 0.82083619, "num_input_tokens_seen": 5872610, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.0, "step": 282, "time_per_iteration": 2.4414730072021484 }, { "auxiliary_loss_clip": 0.01256517, "auxiliary_loss_mlp": 0.01099689, "balance_loss_clip": 1.04740334, "balance_loss_mlp": 1.05429316, "epoch": 0.017014880505035322, "flos": 21357802028160.0, "grad_norm": 2.070819037653251, "language_loss": 0.77117169, "learning_rate": 3.997182746671809e-06, "loss": 0.79473376, "num_input_tokens_seen": 5892985, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.015625, "step": 283, "time_per_iteration": 2.445768117904663 }, { "auxiliary_loss_clip": 0.01257915, "auxiliary_loss_mlp": 0.01084586, "balance_loss_clip": 1.03706956, "balance_loss_mlp": 1.05535746, "epoch": 0.01707500375770329, "flos": 35331753726720.0, "grad_norm": 2.1373580986706613, "language_loss": 0.83959854, "learning_rate": 3.997162664118528e-06, "loss": 0.86302352, "num_input_tokens_seen": 5914060, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 2.03125, "step": 284, "time_per_iteration": 2.573779344558716 }, { "auxiliary_loss_clip": 0.01247415, "auxiliary_loss_mlp": 0.01085849, "balance_loss_clip": 1.03487539, "balance_loss_mlp": 1.04878318, "epoch": 0.01713512701037126, "flos": 23581071273600.0, "grad_norm": 2.3576344067130917, "language_loss": 0.96618634, "learning_rate": 3.99714251029165e-06, "loss": 0.989519, "num_input_tokens_seen": 5932860, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 1.984375, "step": 285, "time_per_iteration": 2.444382667541504 }, { "auxiliary_loss_clip": 0.0125138, "auxiliary_loss_mlp": 0.01083147, "balance_loss_clip": 1.0370605, "balance_loss_mlp": 1.05309486, "epoch": 0.01719525026303923, "flos": 27633474892800.0, "grad_norm": 8.623728045445985, "language_loss": 0.93435287, "learning_rate": 3.997122285191892e-06, "loss": 0.95769811, "num_input_tokens_seen": 5952725, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 1.984375, "step": 286, "time_per_iteration": 2.5120911598205566 }, { "auxiliary_loss_clip": 0.01248755, "auxiliary_loss_mlp": 0.01089959, "balance_loss_clip": 1.03903317, "balance_loss_mlp": 1.05168724, "epoch": 0.0172553735157072, "flos": 26978504186880.0, "grad_norm": 2.0538479761604704, "language_loss": 0.91652668, "learning_rate": 3.997101988819976e-06, "loss": 0.93991387, "num_input_tokens_seen": 5970560, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 1.96875, "step": 287, "time_per_iteration": 2.463068723678589 }, { "auxiliary_loss_clip": 0.01250456, "auxiliary_loss_mlp": 0.01078526, "balance_loss_clip": 1.03065181, "balance_loss_mlp": 1.05379772, "epoch": 0.01731549676837517, "flos": 14055962215680.0, "grad_norm": 3.2909414233324563, "language_loss": 1.01652026, "learning_rate": 3.997081621176629e-06, "loss": 1.03981018, "num_input_tokens_seen": 5982980, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 1.96875, "step": 288, "time_per_iteration": 2.422941207885742 }, { "auxiliary_loss_clip": 0.01164027, "auxiliary_loss_mlp": 0.01052735, "balance_loss_clip": 1.03757143, "balance_loss_mlp": 1.06516135, "epoch": 0.017375620021043137, "flos": 66506885959680.0, "grad_norm": 0.9025926466434199, "language_loss": 0.63966572, "learning_rate": 3.997061182262575e-06, "loss": 0.66183341, "num_input_tokens_seen": 6049445, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.98828125, "step": 289, "time_per_iteration": 3.1877288818359375 }, { "auxiliary_loss_clip": 0.01247348, "auxiliary_loss_mlp": 0.01082034, "balance_loss_clip": 1.03451705, "balance_loss_mlp": 1.05231452, "epoch": 0.01743574327371111, "flos": 15224435331840.0, "grad_norm": 3.2195614434280504, "language_loss": 0.88081455, "learning_rate": 3.997040672078545e-06, "loss": 0.90410841, "num_input_tokens_seen": 6064150, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 1.9453125, "step": 290, "time_per_iteration": 2.4281256198883057 }, { "auxiliary_loss_clip": 0.01248587, "auxiliary_loss_mlp": 0.01079803, "balance_loss_clip": 1.03347826, "balance_loss_mlp": 1.05308676, "epoch": 0.017495866526379078, "flos": 25372708980480.0, "grad_norm": 2.006050207544469, "language_loss": 0.83882666, "learning_rate": 3.997020090625269e-06, "loss": 0.86211061, "num_input_tokens_seen": 6083920, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 1.953125, "step": 291, "time_per_iteration": 2.4570248126983643 }, { "auxiliary_loss_clip": 0.0125354, "auxiliary_loss_mlp": 0.01099016, "balance_loss_clip": 1.04727936, "balance_loss_mlp": 1.05850148, "epoch": 0.017555989779047047, "flos": 26358272150400.0, "grad_norm": 1.7788976427156116, "language_loss": 0.72319877, "learning_rate": 3.996999437903485e-06, "loss": 0.74672437, "num_input_tokens_seen": 6105460, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 1.9453125, "step": 292, "time_per_iteration": 3.9080538749694824 }, { "auxiliary_loss_clip": 0.01247077, "auxiliary_loss_mlp": 0.01092912, "balance_loss_clip": 1.04296315, "balance_loss_mlp": 1.05412185, "epoch": 0.017616113031715015, "flos": 22337919025920.0, "grad_norm": 2.160237997478328, "language_loss": 0.86493468, "learning_rate": 3.996978713913927e-06, "loss": 0.88833451, "num_input_tokens_seen": 6122890, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 1.9296875, "step": 293, "time_per_iteration": 5.241613388061523 }, { "auxiliary_loss_clip": 0.01246615, "auxiliary_loss_mlp": 0.01079838, "balance_loss_clip": 1.03301287, "balance_loss_mlp": 1.05239558, "epoch": 0.017676236284382984, "flos": 20155881962880.0, "grad_norm": 3.2612184299665374, "language_loss": 0.80483878, "learning_rate": 3.996957918657335e-06, "loss": 0.8281033, "num_input_tokens_seen": 6142890, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 1.9375, "step": 294, "time_per_iteration": 3.8060572147369385 }, { "auxiliary_loss_clip": 0.0124757, "auxiliary_loss_mlp": 0.01089644, "balance_loss_clip": 1.03969526, "balance_loss_mlp": 1.05109572, "epoch": 0.017736359537050956, "flos": 25222303376640.0, "grad_norm": 2.5812548654115948, "language_loss": 0.83908248, "learning_rate": 3.996937052134452e-06, "loss": 0.86245465, "num_input_tokens_seen": 6162030, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 1.96875, "step": 295, "time_per_iteration": 2.4810874462127686 }, { "auxiliary_loss_clip": 0.01248576, "auxiliary_loss_mlp": 0.01091159, "balance_loss_clip": 1.04524004, "balance_loss_mlp": 1.05808198, "epoch": 0.017796482789718925, "flos": 20337779479680.0, "grad_norm": 2.0460008989613, "language_loss": 0.83856666, "learning_rate": 3.996916114346023e-06, "loss": 0.86196399, "num_input_tokens_seen": 6180540, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 1.90625, "step": 296, "time_per_iteration": 2.4240550994873047 }, { "auxiliary_loss_clip": 0.01255662, "auxiliary_loss_mlp": 0.0108459, "balance_loss_clip": 1.03647757, "balance_loss_mlp": 1.05740452, "epoch": 0.017856606042386893, "flos": 22378208601600.0, "grad_norm": 2.4621250106449386, "language_loss": 0.87520307, "learning_rate": 3.996895105292794e-06, "loss": 0.89860559, "num_input_tokens_seen": 6199425, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 1.984375, "step": 297, "time_per_iteration": 2.4576117992401123 }, { "auxiliary_loss_clip": 0.01250089, "auxiliary_loss_mlp": 0.01076338, "balance_loss_clip": 1.03039491, "balance_loss_mlp": 1.05379272, "epoch": 0.017916729295054862, "flos": 20229024994560.0, "grad_norm": 2.270374893224995, "language_loss": 0.88099438, "learning_rate": 3.996874024975515e-06, "loss": 0.90425873, "num_input_tokens_seen": 6219170, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 1.9609375, "step": 298, "time_per_iteration": 2.444200277328491 }, { "auxiliary_loss_clip": 0.0124723, "auxiliary_loss_mlp": 0.01087312, "balance_loss_clip": 1.03714836, "balance_loss_mlp": 1.05417967, "epoch": 0.01797685254772283, "flos": 19389957356160.0, "grad_norm": 2.25462221963985, "language_loss": 0.88106245, "learning_rate": 3.996852873394939e-06, "loss": 0.90440786, "num_input_tokens_seen": 6237930, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 1.9296875, "step": 299, "time_per_iteration": 2.4332218170166016 }, { "auxiliary_loss_clip": 0.01257521, "auxiliary_loss_mlp": 0.01077923, "balance_loss_clip": 1.02790248, "balance_loss_mlp": 1.05698752, "epoch": 0.018036975800390802, "flos": 24424851945600.0, "grad_norm": 3.01009565599283, "language_loss": 0.63656032, "learning_rate": 3.996831650551821e-06, "loss": 0.65991479, "num_input_tokens_seen": 6257170, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.0, "step": 300, "time_per_iteration": 2.512960433959961 }, { "auxiliary_loss_clip": 0.01250638, "auxiliary_loss_mlp": 0.01089998, "balance_loss_clip": 1.04167092, "balance_loss_mlp": 1.05774963, "epoch": 0.01809709905305877, "flos": 15778017849600.0, "grad_norm": 2.705806939759899, "language_loss": 0.87975717, "learning_rate": 3.996810356446917e-06, "loss": 0.90316349, "num_input_tokens_seen": 6274780, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 1.9296875, "step": 301, "time_per_iteration": 2.449906826019287 }, { "auxiliary_loss_clip": 0.01170518, "auxiliary_loss_mlp": 0.01035771, "balance_loss_clip": 1.02175176, "balance_loss_mlp": 1.07485867, "epoch": 0.01815722230572674, "flos": 67344592055040.0, "grad_norm": 0.9665553873254724, "language_loss": 0.62200117, "learning_rate": 3.996788991080988e-06, "loss": 0.64406407, "num_input_tokens_seen": 6340435, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.95703125, "step": 302, "time_per_iteration": 3.235015392303467 }, { "auxiliary_loss_clip": 0.01245816, "auxiliary_loss_mlp": 0.01094046, "balance_loss_clip": 1.04636252, "balance_loss_mlp": 1.04997766, "epoch": 0.01821734555839471, "flos": 15484747495680.0, "grad_norm": 2.2391786324512637, "language_loss": 0.89078534, "learning_rate": 3.996767554454796e-06, "loss": 0.91418391, "num_input_tokens_seen": 6358160, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 1.9609375, "step": 303, "time_per_iteration": 2.4486289024353027 }, { "auxiliary_loss_clip": 0.01252791, "auxiliary_loss_mlp": 0.01097796, "balance_loss_clip": 1.04799032, "balance_loss_mlp": 1.05749726, "epoch": 0.018277468811062677, "flos": 24096284340480.0, "grad_norm": 1.7268145183362635, "language_loss": 0.79628664, "learning_rate": 3.996746046569107e-06, "loss": 0.81979251, "num_input_tokens_seen": 6378485, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 1.953125, "step": 304, "time_per_iteration": 2.498051643371582 }, { "auxiliary_loss_clip": 0.01242654, "auxiliary_loss_mlp": 0.01073108, "balance_loss_clip": 1.02733171, "balance_loss_mlp": 1.05657101, "epoch": 0.01833759206373065, "flos": 20958290807040.0, "grad_norm": 1.6486581714926711, "language_loss": 0.82408345, "learning_rate": 3.996724467424687e-06, "loss": 0.84724116, "num_input_tokens_seen": 6397845, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 1.859375, "step": 305, "time_per_iteration": 2.4394264221191406 }, { "auxiliary_loss_clip": 0.0124937, "auxiliary_loss_mlp": 0.01077638, "balance_loss_clip": 1.03140867, "balance_loss_mlp": 1.05194199, "epoch": 0.018397715316398618, "flos": 19389747888000.0, "grad_norm": 1.952855863430056, "language_loss": 0.90433657, "learning_rate": 3.996702817022308e-06, "loss": 0.9276067, "num_input_tokens_seen": 6416475, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 1.9765625, "step": 306, "time_per_iteration": 2.4558498859405518 }, { "auxiliary_loss_clip": 0.01239261, "auxiliary_loss_mlp": 0.0108027, "balance_loss_clip": 1.03404021, "balance_loss_mlp": 1.04859948, "epoch": 0.018457838569066586, "flos": 29131248752640.0, "grad_norm": 2.143075141284067, "language_loss": 0.86084306, "learning_rate": 3.996681095362741e-06, "loss": 0.88403845, "num_input_tokens_seen": 6437520, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 1.90625, "step": 307, "time_per_iteration": 2.537835121154785 }, { "auxiliary_loss_clip": 0.01241134, "auxiliary_loss_mlp": 0.01080609, "balance_loss_clip": 1.03178144, "balance_loss_mlp": 1.05186296, "epoch": 0.018517961821734555, "flos": 19207640903040.0, "grad_norm": 2.3437069898355904, "language_loss": 0.71195388, "learning_rate": 3.996659302446762e-06, "loss": 0.73517132, "num_input_tokens_seen": 6455680, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 1.890625, "step": 308, "time_per_iteration": 2.4432265758514404 }, { "auxiliary_loss_clip": 0.01246949, "auxiliary_loss_mlp": 0.01087794, "balance_loss_clip": 1.04046774, "balance_loss_mlp": 1.05282617, "epoch": 0.018578085074402523, "flos": 19862053027200.0, "grad_norm": 2.6018867667266163, "language_loss": 0.91403347, "learning_rate": 3.996637438275148e-06, "loss": 0.93738091, "num_input_tokens_seen": 6474880, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 1.9375, "step": 309, "time_per_iteration": 2.4650886058807373 }, { "auxiliary_loss_clip": 0.0125455, "auxiliary_loss_mlp": 0.01084848, "balance_loss_clip": 1.03559089, "balance_loss_mlp": 1.05105126, "epoch": 0.018638208327070496, "flos": 29605648573440.0, "grad_norm": 1.9363741747675771, "language_loss": 0.72133344, "learning_rate": 3.99661550284868e-06, "loss": 0.74472737, "num_input_tokens_seen": 6495945, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.03125, "step": 310, "time_per_iteration": 2.515129566192627 }, { "auxiliary_loss_clip": 0.01245354, "auxiliary_loss_mlp": 0.01099865, "balance_loss_clip": 1.05294418, "balance_loss_mlp": 1.05505824, "epoch": 0.018698331579738464, "flos": 45729866131200.0, "grad_norm": 2.0692631349636943, "language_loss": 0.73453295, "learning_rate": 3.996593496168141e-06, "loss": 0.75798512, "num_input_tokens_seen": 6519930, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 1.8984375, "step": 311, "time_per_iteration": 2.6794707775115967 }, { "auxiliary_loss_clip": 0.01254028, "auxiliary_loss_mlp": 0.01086933, "balance_loss_clip": 1.04008377, "balance_loss_mlp": 1.05406392, "epoch": 0.018758454832406433, "flos": 20482669088640.0, "grad_norm": 3.6044382373216695, "language_loss": 0.90822446, "learning_rate": 3.996571418234316e-06, "loss": 0.93163407, "num_input_tokens_seen": 6535070, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 2.0, "step": 312, "time_per_iteration": 2.454665184020996 }, { "auxiliary_loss_clip": 0.01253057, "auxiliary_loss_mlp": 0.01093846, "balance_loss_clip": 1.04518485, "balance_loss_mlp": 1.05319011, "epoch": 0.0188185780850744, "flos": 15776900686080.0, "grad_norm": 2.1685699705162365, "language_loss": 0.89634204, "learning_rate": 3.996549269047992e-06, "loss": 0.91981101, "num_input_tokens_seen": 6554135, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 2.0, "step": 313, "time_per_iteration": 2.413755178451538 }, { "auxiliary_loss_clip": 0.01254911, "auxiliary_loss_mlp": 0.01084852, "balance_loss_clip": 1.03831267, "balance_loss_mlp": 1.0554781, "epoch": 0.018878701337742373, "flos": 22454633301120.0, "grad_norm": 2.2131754408423623, "language_loss": 0.72605658, "learning_rate": 3.996527048609961e-06, "loss": 0.7494542, "num_input_tokens_seen": 6572275, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.9921875, "step": 314, "time_per_iteration": 2.461585283279419 }, { "auxiliary_loss_clip": 0.01247425, "auxiliary_loss_mlp": 0.01093213, "balance_loss_clip": 1.04669785, "balance_loss_mlp": 1.0528543, "epoch": 0.018938824590410342, "flos": 30992189241600.0, "grad_norm": 2.4088693693289045, "language_loss": 0.88752794, "learning_rate": 3.996504756921015e-06, "loss": 0.91093433, "num_input_tokens_seen": 6594520, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.9453125, "step": 315, "time_per_iteration": 2.527616024017334 }, { "auxiliary_loss_clip": 0.01245421, "auxiliary_loss_mlp": 0.0107802, "balance_loss_clip": 1.03062224, "balance_loss_mlp": 1.05360627, "epoch": 0.01899894784307831, "flos": 23257775283840.0, "grad_norm": 1.883027643759866, "language_loss": 0.80180895, "learning_rate": 3.996482393981951e-06, "loss": 0.82504332, "num_input_tokens_seen": 6614245, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 1.921875, "step": 316, "time_per_iteration": 2.5028390884399414 }, { "auxiliary_loss_clip": 0.01244454, "auxiliary_loss_mlp": 0.01081262, "balance_loss_clip": 1.03372157, "balance_loss_mlp": 1.05157375, "epoch": 0.01905907109574628, "flos": 17456921176320.0, "grad_norm": 2.341369116632892, "language_loss": 0.89989537, "learning_rate": 3.996459959793564e-06, "loss": 0.92315257, "num_input_tokens_seen": 6632015, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 1.9296875, "step": 317, "time_per_iteration": 2.4208786487579346 }, { "auxiliary_loss_clip": 0.01239179, "auxiliary_loss_mlp": 0.01081755, "balance_loss_clip": 1.03349936, "balance_loss_mlp": 1.04840732, "epoch": 0.019119194348414248, "flos": 14969499517440.0, "grad_norm": 4.3113874763269395, "language_loss": 0.90558648, "learning_rate": 3.996437454356658e-06, "loss": 0.92879575, "num_input_tokens_seen": 6649015, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 1.90625, "step": 318, "time_per_iteration": 2.414809226989746 }, { "auxiliary_loss_clip": 0.01241514, "auxiliary_loss_mlp": 0.01070685, "balance_loss_clip": 1.02605319, "balance_loss_mlp": 1.0510509, "epoch": 0.01917931760108222, "flos": 25481672933760.0, "grad_norm": 6.179517827759149, "language_loss": 0.93067336, "learning_rate": 3.996414877672034e-06, "loss": 0.95379531, "num_input_tokens_seen": 6669225, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 1.90625, "step": 319, "time_per_iteration": 2.4777655601501465 }, { "auxiliary_loss_clip": 0.01141162, "auxiliary_loss_mlp": 0.01034181, "balance_loss_clip": 1.02130675, "balance_loss_mlp": 1.05140138, "epoch": 0.01923944085375019, "flos": 71553722100480.0, "grad_norm": 0.9027108154994729, "language_loss": 0.59722847, "learning_rate": 3.996392229740498e-06, "loss": 0.6189819, "num_input_tokens_seen": 6725775, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.8984375, "step": 320, "time_per_iteration": 3.004598379135132 }, { "auxiliary_loss_clip": 0.0123998, "auxiliary_loss_mlp": 0.01086659, "balance_loss_clip": 1.03847432, "balance_loss_mlp": 1.0485394, "epoch": 0.019299564106418157, "flos": 19681482142080.0, "grad_norm": 3.3648702202179, "language_loss": 0.89259684, "learning_rate": 3.99636951056286e-06, "loss": 0.91586322, "num_input_tokens_seen": 6744170, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 1.9140625, "step": 321, "time_per_iteration": 2.4724040031433105 }, { "auxiliary_loss_clip": 0.01247364, "auxiliary_loss_mlp": 0.0109448, "balance_loss_clip": 1.04455495, "balance_loss_mlp": 1.05249262, "epoch": 0.019359687359086126, "flos": 24386063558400.0, "grad_norm": 2.611546329556763, "language_loss": 0.82608497, "learning_rate": 3.996346720139928e-06, "loss": 0.84950346, "num_input_tokens_seen": 6764565, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 1.9453125, "step": 322, "time_per_iteration": 2.472501039505005 }, { "auxiliary_loss_clip": 0.0124952, "auxiliary_loss_mlp": 0.01084466, "balance_loss_clip": 1.03673482, "balance_loss_mlp": 1.05295157, "epoch": 0.019419810611754094, "flos": 23950242656640.0, "grad_norm": 2.3185725465210574, "language_loss": 0.72154129, "learning_rate": 3.996323858472518e-06, "loss": 0.74488103, "num_input_tokens_seen": 6785310, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 1.96875, "step": 323, "time_per_iteration": 2.49354887008667 }, { "auxiliary_loss_clip": 0.01238313, "auxiliary_loss_mlp": 0.0107448, "balance_loss_clip": 1.02827406, "balance_loss_mlp": 1.04739833, "epoch": 0.019479933864422067, "flos": 22159233354240.0, "grad_norm": 1.9805260952758794, "language_loss": 0.92195767, "learning_rate": 3.996300925561445e-06, "loss": 0.94508559, "num_input_tokens_seen": 6803290, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 1.90625, "step": 324, "time_per_iteration": 2.4647061824798584 }, { "auxiliary_loss_clip": 0.01249478, "auxiliary_loss_mlp": 0.01078997, "balance_loss_clip": 1.03441286, "balance_loss_mlp": 1.05450225, "epoch": 0.019540057117090035, "flos": 22235727876480.0, "grad_norm": 5.209329635238033, "language_loss": 0.64969045, "learning_rate": 3.996277921407525e-06, "loss": 0.67297518, "num_input_tokens_seen": 6822570, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 1.9453125, "step": 325, "time_per_iteration": 2.4915707111358643 }, { "auxiliary_loss_clip": 0.01247938, "auxiliary_loss_mlp": 0.01081587, "balance_loss_clip": 1.03504801, "balance_loss_mlp": 1.05839956, "epoch": 0.019600180369758004, "flos": 23075633387520.0, "grad_norm": 3.213050020436582, "language_loss": 0.76240909, "learning_rate": 3.996254846011582e-06, "loss": 0.78570437, "num_input_tokens_seen": 6841910, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.890625, "step": 326, "time_per_iteration": 2.4705817699432373 }, { "auxiliary_loss_clip": 0.01243276, "auxiliary_loss_mlp": 0.01090596, "balance_loss_clip": 1.04548764, "balance_loss_mlp": 1.05709124, "epoch": 0.019660303622425972, "flos": 25409681976960.0, "grad_norm": 3.3797375509014422, "language_loss": 0.79048991, "learning_rate": 3.99623169937444e-06, "loss": 0.81382859, "num_input_tokens_seen": 6862480, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 1.8671875, "step": 327, "time_per_iteration": 2.4997310638427734 }, { "auxiliary_loss_clip": 0.01250367, "auxiliary_loss_mlp": 0.01083052, "balance_loss_clip": 1.03579772, "balance_loss_mlp": 1.05862069, "epoch": 0.01972042687509394, "flos": 23656448632320.0, "grad_norm": 2.246546113724492, "language_loss": 0.80648839, "learning_rate": 3.996208481496923e-06, "loss": 0.82982254, "num_input_tokens_seen": 6882015, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 1.921875, "step": 328, "time_per_iteration": 2.4818077087402344 }, { "auxiliary_loss_clip": 0.01246085, "auxiliary_loss_mlp": 0.01091705, "balance_loss_clip": 1.04483187, "balance_loss_mlp": 1.05255795, "epoch": 0.019780550127761913, "flos": 18222496669440.0, "grad_norm": 3.1820907475393647, "language_loss": 0.93123943, "learning_rate": 3.996185192379858e-06, "loss": 0.95461732, "num_input_tokens_seen": 6899785, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 1.9375, "step": 329, "time_per_iteration": 2.426272392272949 }, { "auxiliary_loss_clip": 0.01251169, "auxiliary_loss_mlp": 0.01090414, "balance_loss_clip": 1.0451622, "balance_loss_mlp": 1.05434823, "epoch": 0.01984067338042988, "flos": 22417695216000.0, "grad_norm": 2.3763564534737656, "language_loss": 0.74106705, "learning_rate": 3.996161832024081e-06, "loss": 0.76448292, "num_input_tokens_seen": 6918575, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 1.96875, "step": 330, "time_per_iteration": 2.4598453044891357 }, { "auxiliary_loss_clip": 0.01249553, "auxiliary_loss_mlp": 0.01101458, "balance_loss_clip": 1.05377388, "balance_loss_mlp": 1.05429959, "epoch": 0.01990079663309785, "flos": 17054267932800.0, "grad_norm": 2.746237667836226, "language_loss": 0.92803168, "learning_rate": 3.996138400430422e-06, "loss": 0.95154178, "num_input_tokens_seen": 6936965, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 1.953125, "step": 331, "time_per_iteration": 3.8557064533233643 }, { "auxiliary_loss_clip": 0.01237676, "auxiliary_loss_mlp": 0.01075151, "balance_loss_clip": 1.03235459, "balance_loss_mlp": 1.05177975, "epoch": 0.01996091988576582, "flos": 15960857973120.0, "grad_norm": 3.6761560054958493, "language_loss": 0.92356098, "learning_rate": 3.996114897599718e-06, "loss": 0.94668925, "num_input_tokens_seen": 6953475, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 1.859375, "step": 332, "time_per_iteration": 3.852618455886841 }, { "auxiliary_loss_clip": 0.01240775, "auxiliary_loss_mlp": 0.01078938, "balance_loss_clip": 1.0318985, "balance_loss_mlp": 1.05442214, "epoch": 0.02002104313843379, "flos": 23585330459520.0, "grad_norm": 2.542787881565984, "language_loss": 0.74470538, "learning_rate": 3.996091323532807e-06, "loss": 0.76790249, "num_input_tokens_seen": 6971630, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 1.8671875, "step": 333, "time_per_iteration": 3.8072381019592285 }, { "auxiliary_loss_clip": 0.01244784, "auxiliary_loss_mlp": 0.01077039, "balance_loss_clip": 1.03154898, "balance_loss_mlp": 1.05381489, "epoch": 0.02008116639110176, "flos": 34093454158080.0, "grad_norm": 2.2161167871731506, "language_loss": 0.78280437, "learning_rate": 3.996067678230532e-06, "loss": 0.80602264, "num_input_tokens_seen": 6992775, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 1.90625, "step": 334, "time_per_iteration": 3.9226768016815186 }, { "auxiliary_loss_clip": 0.01243757, "auxiliary_loss_mlp": 0.0107732, "balance_loss_clip": 1.03049493, "balance_loss_mlp": 1.04973888, "epoch": 0.020141289643769728, "flos": 19682669128320.0, "grad_norm": 1.8925571247576105, "language_loss": 0.82887501, "learning_rate": 3.996043961693736e-06, "loss": 0.85208577, "num_input_tokens_seen": 7011425, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 1.9375, "step": 335, "time_per_iteration": 2.5068249702453613 }, { "auxiliary_loss_clip": 0.01240044, "auxiliary_loss_mlp": 0.01073121, "balance_loss_clip": 1.02844167, "balance_loss_mlp": 1.05044055, "epoch": 0.020201412896437697, "flos": 20739525027840.0, "grad_norm": 2.570482808050445, "language_loss": 0.9190805, "learning_rate": 3.996020173923266e-06, "loss": 0.9422121, "num_input_tokens_seen": 7029450, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 1.8984375, "step": 336, "time_per_iteration": 2.420025587081909 }, { "auxiliary_loss_clip": 0.01243367, "auxiliary_loss_mlp": 0.01077506, "balance_loss_clip": 1.03118145, "balance_loss_mlp": 1.0501318, "epoch": 0.020261536149105665, "flos": 20265474320640.0, "grad_norm": 2.029482100880366, "language_loss": 0.87759602, "learning_rate": 3.99599631491997e-06, "loss": 0.90080476, "num_input_tokens_seen": 7047555, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 1.9296875, "step": 337, "time_per_iteration": 2.4157845973968506 }, { "auxiliary_loss_clip": 0.01236485, "auxiliary_loss_mlp": 0.01086757, "balance_loss_clip": 1.04064679, "balance_loss_mlp": 1.04916739, "epoch": 0.020321659401773638, "flos": 25847562648960.0, "grad_norm": 1.5226922127191085, "language_loss": 0.89615571, "learning_rate": 3.995972384684699e-06, "loss": 0.91938806, "num_input_tokens_seen": 7068185, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 1.875, "step": 338, "time_per_iteration": 2.4866647720336914 }, { "auxiliary_loss_clip": 0.01238625, "auxiliary_loss_mlp": 0.01073964, "balance_loss_clip": 1.02556515, "balance_loss_mlp": 1.04621446, "epoch": 0.020381782654441606, "flos": 17494033818240.0, "grad_norm": 2.4574020059875217, "language_loss": 0.84838378, "learning_rate": 3.995948383218309e-06, "loss": 0.87150961, "num_input_tokens_seen": 7085955, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 1.921875, "step": 339, "time_per_iteration": 2.436677932739258 }, { "auxiliary_loss_clip": 0.01243425, "auxiliary_loss_mlp": 0.01085429, "balance_loss_clip": 1.0390569, "balance_loss_mlp": 1.05143118, "epoch": 0.020441905907109575, "flos": 24242779872000.0, "grad_norm": 1.9556810807536034, "language_loss": 0.88591182, "learning_rate": 3.995924310521655e-06, "loss": 0.90920031, "num_input_tokens_seen": 7106345, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 1.921875, "step": 340, "time_per_iteration": 2.4788992404937744 }, { "auxiliary_loss_clip": 0.01240924, "auxiliary_loss_mlp": 0.01079023, "balance_loss_clip": 1.03181577, "balance_loss_mlp": 1.05091405, "epoch": 0.020502029159777543, "flos": 22232306563200.0, "grad_norm": 2.1547913145760376, "language_loss": 0.87746286, "learning_rate": 3.995900166595596e-06, "loss": 0.9006623, "num_input_tokens_seen": 7125070, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 1.8984375, "step": 341, "time_per_iteration": 2.4392197132110596 }, { "auxiliary_loss_clip": 0.01244465, "auxiliary_loss_mlp": 0.01084104, "balance_loss_clip": 1.0366106, "balance_loss_mlp": 1.05005693, "epoch": 0.020562152412445512, "flos": 23986726894080.0, "grad_norm": 2.1928377266058137, "language_loss": 0.79686862, "learning_rate": 3.995875951440995e-06, "loss": 0.82015431, "num_input_tokens_seen": 7144675, "router_z_loss_clip": 0.47460938, "router_z_loss_mlp": 1.9375, "step": 342, "time_per_iteration": 2.454228162765503 }, { "auxiliary_loss_clip": 0.01231521, "auxiliary_loss_mlp": 0.01077618, "balance_loss_clip": 1.03029144, "balance_loss_mlp": 1.04603922, "epoch": 0.020622275665113484, "flos": 26974210089600.0, "grad_norm": 1.8821152658598543, "language_loss": 0.8900106, "learning_rate": 3.995851665058715e-06, "loss": 0.91310197, "num_input_tokens_seen": 7165505, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 1.8515625, "step": 343, "time_per_iteration": 2.4975695610046387 }, { "auxiliary_loss_clip": 0.01247423, "auxiliary_loss_mlp": 0.0108777, "balance_loss_clip": 1.04194629, "balance_loss_mlp": 1.05635118, "epoch": 0.020682398917781453, "flos": 22599627644160.0, "grad_norm": 2.4061841581096366, "language_loss": 0.77623147, "learning_rate": 3.995827307449623e-06, "loss": 0.79958338, "num_input_tokens_seen": 7184605, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 1.90625, "step": 344, "time_per_iteration": 2.500241279602051 }, { "auxiliary_loss_clip": 0.01228052, "auxiliary_loss_mlp": 0.01089187, "balance_loss_clip": 1.04031157, "balance_loss_mlp": 1.04616785, "epoch": 0.02074252217044942, "flos": 15012686736000.0, "grad_norm": 2.1031120440135336, "language_loss": 0.74481457, "learning_rate": 3.995802878614588e-06, "loss": 0.76798695, "num_input_tokens_seen": 7203065, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 1.8203125, "step": 345, "time_per_iteration": 2.4321231842041016 }, { "auxiliary_loss_clip": 0.01236457, "auxiliary_loss_mlp": 0.01083525, "balance_loss_clip": 1.03290892, "balance_loss_mlp": 1.05203187, "epoch": 0.02080264542311739, "flos": 25336783324800.0, "grad_norm": 2.1070814705930667, "language_loss": 0.89819229, "learning_rate": 3.995778378554483e-06, "loss": 0.92139214, "num_input_tokens_seen": 7222995, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 1.84375, "step": 346, "time_per_iteration": 2.500624179840088 }, { "auxiliary_loss_clip": 0.01233687, "auxiliary_loss_mlp": 0.01080794, "balance_loss_clip": 1.03635287, "balance_loss_mlp": 1.04964137, "epoch": 0.02086276867578536, "flos": 24387669480960.0, "grad_norm": 2.0655644820909558, "language_loss": 0.78656721, "learning_rate": 3.99575380727018e-06, "loss": 0.80971205, "num_input_tokens_seen": 7244625, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 1.84375, "step": 347, "time_per_iteration": 2.4814603328704834 }, { "auxiliary_loss_clip": 0.01237051, "auxiliary_loss_mlp": 0.01080866, "balance_loss_clip": 1.03516102, "balance_loss_mlp": 1.05103707, "epoch": 0.02092289192845333, "flos": 24461056892160.0, "grad_norm": 1.8886253600904628, "language_loss": 0.70518041, "learning_rate": 3.995729164762559e-06, "loss": 0.72835958, "num_input_tokens_seen": 7263255, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 1.859375, "step": 348, "time_per_iteration": 2.498539447784424 }, { "auxiliary_loss_clip": 0.01240629, "auxiliary_loss_mlp": 0.0108724, "balance_loss_clip": 1.04062903, "balance_loss_mlp": 1.04997635, "epoch": 0.0209830151811213, "flos": 17450392752000.0, "grad_norm": 9.238146016146256, "language_loss": 0.76325005, "learning_rate": 3.995704451032496e-06, "loss": 0.78652877, "num_input_tokens_seen": 7279275, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.90625, "step": 349, "time_per_iteration": 2.4045448303222656 }, { "auxiliary_loss_clip": 0.01223683, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.03178263, "balance_loss_mlp": 1.04728103, "epoch": 0.021043138433789268, "flos": 24572778842880.0, "grad_norm": 1.7558642765462482, "language_loss": 0.85043454, "learning_rate": 3.995679666080876e-06, "loss": 0.87341309, "num_input_tokens_seen": 7300180, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 1.765625, "step": 350, "time_per_iteration": 2.5335793495178223 }, { "auxiliary_loss_clip": 0.01231843, "auxiliary_loss_mlp": 0.0107489, "balance_loss_clip": 1.03364384, "balance_loss_mlp": 1.05170834, "epoch": 0.021103261686457236, "flos": 24453132013440.0, "grad_norm": 7.130091306898022, "language_loss": 0.79452366, "learning_rate": 3.995654809908581e-06, "loss": 0.81759101, "num_input_tokens_seen": 7317430, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 1.8046875, "step": 351, "time_per_iteration": 2.4338746070861816 }, { "auxiliary_loss_clip": 0.01236511, "auxiliary_loss_mlp": 0.01092286, "balance_loss_clip": 1.0449841, "balance_loss_mlp": 1.0526005, "epoch": 0.021163384939125205, "flos": 14682233917440.0, "grad_norm": 3.84086245600335, "language_loss": 0.87032181, "learning_rate": 3.9956298825165005e-06, "loss": 0.89360976, "num_input_tokens_seen": 7334875, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 1.84375, "step": 352, "time_per_iteration": 2.449504852294922 }, { "auxiliary_loss_clip": 0.01233454, "auxiliary_loss_mlp": 0.01077861, "balance_loss_clip": 1.03132141, "balance_loss_mlp": 1.05080497, "epoch": 0.021223508191793177, "flos": 24492199691520.0, "grad_norm": 1.8489810839612493, "language_loss": 0.82099515, "learning_rate": 3.995604883905522e-06, "loss": 0.84410834, "num_input_tokens_seen": 7355185, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.828125, "step": 353, "time_per_iteration": 2.4712696075439453 }, { "auxiliary_loss_clip": 0.01229593, "auxiliary_loss_mlp": 0.01073406, "balance_loss_clip": 1.03025222, "balance_loss_mlp": 1.0502218, "epoch": 0.021283631444461146, "flos": 24126030685440.0, "grad_norm": 1.8250425149469043, "language_loss": 0.80346203, "learning_rate": 3.995579814076539e-06, "loss": 0.82649195, "num_input_tokens_seen": 7374425, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 1.796875, "step": 354, "time_per_iteration": 2.4686806201934814 }, { "auxiliary_loss_clip": 0.01236871, "auxiliary_loss_mlp": 0.01079676, "balance_loss_clip": 1.03323174, "balance_loss_mlp": 1.05005431, "epoch": 0.021343754697129114, "flos": 25191055843200.0, "grad_norm": 3.0584252208007134, "language_loss": 0.80488598, "learning_rate": 3.9955546730304455e-06, "loss": 0.82805151, "num_input_tokens_seen": 7394175, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.875, "step": 355, "time_per_iteration": 2.4575672149658203 }, { "auxiliary_loss_clip": 0.01232122, "auxiliary_loss_mlp": 0.01078482, "balance_loss_clip": 1.03416014, "balance_loss_mlp": 1.04956889, "epoch": 0.021403877949797083, "flos": 17273243180160.0, "grad_norm": 3.327945812803813, "language_loss": 0.88961899, "learning_rate": 3.995529460768139e-06, "loss": 0.91272497, "num_input_tokens_seen": 7412645, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 1.828125, "step": 356, "time_per_iteration": 2.4201087951660156 }, { "auxiliary_loss_clip": 0.01229322, "auxiliary_loss_mlp": 0.01077796, "balance_loss_clip": 1.03042173, "balance_loss_mlp": 1.04959869, "epoch": 0.021464001202465055, "flos": 30916183478400.0, "grad_norm": 2.6518396998699507, "language_loss": 0.79755867, "learning_rate": 3.995504177290519e-06, "loss": 0.82062984, "num_input_tokens_seen": 7432275, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 1.796875, "step": 357, "time_per_iteration": 2.510831356048584 }, { "auxiliary_loss_clip": 0.0123213, "auxiliary_loss_mlp": 0.01073983, "balance_loss_clip": 1.03171182, "balance_loss_mlp": 1.0482384, "epoch": 0.021524124455133024, "flos": 18185418938880.0, "grad_norm": 2.8196932538713564, "language_loss": 0.76050007, "learning_rate": 3.995478822598488e-06, "loss": 0.78356123, "num_input_tokens_seen": 7450245, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.84375, "step": 358, "time_per_iteration": 2.4422390460968018 }, { "auxiliary_loss_clip": 0.01229352, "auxiliary_loss_mlp": 0.01079093, "balance_loss_clip": 1.03245842, "balance_loss_mlp": 1.04627466, "epoch": 0.021584247707800992, "flos": 13805006296320.0, "grad_norm": 2.3266225841257038, "language_loss": 0.88053858, "learning_rate": 3.995453396692951e-06, "loss": 0.90362304, "num_input_tokens_seen": 7466845, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 1.828125, "step": 359, "time_per_iteration": 2.3980555534362793 }, { "auxiliary_loss_clip": 0.01234026, "auxiliary_loss_mlp": 0.01068698, "balance_loss_clip": 1.02614033, "balance_loss_mlp": 1.05061042, "epoch": 0.02164437096046896, "flos": 23293596205440.0, "grad_norm": 4.100534030530065, "language_loss": 0.7596643, "learning_rate": 3.995427899574816e-06, "loss": 0.78269148, "num_input_tokens_seen": 7485450, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 1.828125, "step": 360, "time_per_iteration": 2.508373737335205 }, { "auxiliary_loss_clip": 0.01130479, "auxiliary_loss_mlp": 0.01015122, "balance_loss_clip": 1.00186574, "balance_loss_mlp": 1.03843045, "epoch": 0.02170449421313693, "flos": 68896237875840.0, "grad_norm": 0.8318827427639371, "language_loss": 0.64908099, "learning_rate": 3.99540233124499e-06, "loss": 0.67053699, "num_input_tokens_seen": 7553780, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.921875, "step": 361, "time_per_iteration": 3.1652159690856934 }, { "auxiliary_loss_clip": 0.0122958, "auxiliary_loss_mlp": 0.01074758, "balance_loss_clip": 1.03010178, "balance_loss_mlp": 1.04697037, "epoch": 0.0217646174658049, "flos": 25227365523840.0, "grad_norm": 3.2735007447015194, "language_loss": 0.77731925, "learning_rate": 3.995376691704389e-06, "loss": 0.80036259, "num_input_tokens_seen": 7574155, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 1.828125, "step": 362, "time_per_iteration": 2.497349262237549 }, { "auxiliary_loss_clip": 0.01232158, "auxiliary_loss_mlp": 0.01070141, "balance_loss_clip": 1.02698779, "balance_loss_mlp": 1.04921007, "epoch": 0.02182474071847287, "flos": 22892025214080.0, "grad_norm": 2.257508093659879, "language_loss": 0.9193871, "learning_rate": 3.995350980953926e-06, "loss": 0.94241005, "num_input_tokens_seen": 7592320, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 1.828125, "step": 363, "time_per_iteration": 2.4355170726776123 }, { "auxiliary_loss_clip": 0.0122669, "auxiliary_loss_mlp": 0.01077318, "balance_loss_clip": 1.03614295, "balance_loss_mlp": 1.04583764, "epoch": 0.02188486397114084, "flos": 23657879998080.0, "grad_norm": 3.088695106876858, "language_loss": 0.89338195, "learning_rate": 3.99532519899452e-06, "loss": 0.91642201, "num_input_tokens_seen": 7611185, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 1.8125, "step": 364, "time_per_iteration": 2.4619293212890625 }, { "auxiliary_loss_clip": 0.01231817, "auxiliary_loss_mlp": 0.01075566, "balance_loss_clip": 1.03019536, "balance_loss_mlp": 1.04998064, "epoch": 0.021944987223808807, "flos": 21542562276480.0, "grad_norm": 2.164351222240482, "language_loss": 0.78897971, "learning_rate": 3.99529934582709e-06, "loss": 0.81205356, "num_input_tokens_seen": 7631970, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 1.8203125, "step": 365, "time_per_iteration": 2.4631075859069824 }, { "auxiliary_loss_clip": 0.0123104, "auxiliary_loss_mlp": 0.01081015, "balance_loss_clip": 1.03552461, "balance_loss_mlp": 1.04984212, "epoch": 0.022005110476476776, "flos": 16069961571840.0, "grad_norm": 2.7063772335893934, "language_loss": 0.83782774, "learning_rate": 3.995273421452558e-06, "loss": 0.86094832, "num_input_tokens_seen": 7649745, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 1.8125, "step": 366, "time_per_iteration": 2.45685076713562 }, { "auxiliary_loss_clip": 0.01234878, "auxiliary_loss_mlp": 0.01070655, "balance_loss_clip": 1.02695298, "balance_loss_mlp": 1.05081034, "epoch": 0.022065233729144748, "flos": 21432655716480.0, "grad_norm": 2.1036187774214907, "language_loss": 0.86579663, "learning_rate": 3.995247425871851e-06, "loss": 0.888852, "num_input_tokens_seen": 7668830, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 1.84375, "step": 367, "time_per_iteration": 2.4273951053619385 }, { "auxiliary_loss_clip": 0.01234498, "auxiliary_loss_mlp": 0.01090663, "balance_loss_clip": 1.04426718, "balance_loss_mlp": 1.04885268, "epoch": 0.022125356981812717, "flos": 21542632099200.0, "grad_norm": 2.387828187297824, "language_loss": 0.84244931, "learning_rate": 3.995221359085895e-06, "loss": 0.86570096, "num_input_tokens_seen": 7687240, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 1.859375, "step": 368, "time_per_iteration": 2.4499919414520264 }, { "auxiliary_loss_clip": 0.01234171, "auxiliary_loss_mlp": 0.01067317, "balance_loss_clip": 1.02347231, "balance_loss_mlp": 1.04798639, "epoch": 0.022185480234480685, "flos": 20703110613120.0, "grad_norm": 2.3411944361606665, "language_loss": 0.74964315, "learning_rate": 3.995195221095621e-06, "loss": 0.77265799, "num_input_tokens_seen": 7704440, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 1.859375, "step": 369, "time_per_iteration": 2.4479618072509766 }, { "auxiliary_loss_clip": 0.01229101, "auxiliary_loss_mlp": 0.0107853, "balance_loss_clip": 1.03704512, "balance_loss_mlp": 1.04931593, "epoch": 0.022245603487148654, "flos": 25191998449920.0, "grad_norm": 2.161917099101355, "language_loss": 0.82162476, "learning_rate": 3.995169011901963e-06, "loss": 0.84470105, "num_input_tokens_seen": 7727160, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.796875, "step": 370, "time_per_iteration": 3.883336305618286 }, { "auxiliary_loss_clip": 0.01231991, "auxiliary_loss_mlp": 0.0108399, "balance_loss_clip": 1.04186118, "balance_loss_mlp": 1.05016744, "epoch": 0.022305726739816623, "flos": 21394914670080.0, "grad_norm": 2.3533588348566785, "language_loss": 0.81352019, "learning_rate": 3.995142731505854e-06, "loss": 0.83667994, "num_input_tokens_seen": 7747730, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.8203125, "step": 371, "time_per_iteration": 2.4541006088256836 }, { "auxiliary_loss_clip": 0.01233079, "auxiliary_loss_mlp": 0.0108394, "balance_loss_clip": 1.03885484, "balance_loss_mlp": 1.05017495, "epoch": 0.022365849992484595, "flos": 22491047715840.0, "grad_norm": 2.63447056724904, "language_loss": 0.83084446, "learning_rate": 3.995116379908234e-06, "loss": 0.85401469, "num_input_tokens_seen": 7766765, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 1.828125, "step": 372, "time_per_iteration": 5.315749168395996 }, { "auxiliary_loss_clip": 0.01225089, "auxiliary_loss_mlp": 0.01074048, "balance_loss_clip": 1.02908194, "balance_loss_mlp": 1.04825759, "epoch": 0.022425973245152563, "flos": 17855664347520.0, "grad_norm": 5.51553422257015, "language_loss": 0.78102767, "learning_rate": 3.995089957110041e-06, "loss": 0.80401897, "num_input_tokens_seen": 7784010, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 1.765625, "step": 373, "time_per_iteration": 3.8593719005584717 }, { "auxiliary_loss_clip": 0.01229883, "auxiliary_loss_mlp": 0.01077793, "balance_loss_clip": 1.03256512, "balance_loss_mlp": 1.04971766, "epoch": 0.022486096497820532, "flos": 15482233877760.0, "grad_norm": 2.5033796908762147, "language_loss": 0.76996267, "learning_rate": 3.995063463112221e-06, "loss": 0.79303944, "num_input_tokens_seen": 7801305, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 1.8046875, "step": 374, "time_per_iteration": 2.419429302215576 }, { "auxiliary_loss_clip": 0.01228989, "auxiliary_loss_mlp": 0.01072966, "balance_loss_clip": 1.0256635, "balance_loss_mlp": 1.0454855, "epoch": 0.0225462197504885, "flos": 27782868067200.0, "grad_norm": 1.936964279705477, "language_loss": 0.85860884, "learning_rate": 3.995036897915717e-06, "loss": 0.88162833, "num_input_tokens_seen": 7823965, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 1.8359375, "step": 375, "time_per_iteration": 2.4995079040527344 }, { "auxiliary_loss_clip": 0.0123226, "auxiliary_loss_mlp": 0.01084578, "balance_loss_clip": 1.03849185, "balance_loss_mlp": 1.05112147, "epoch": 0.02260634300315647, "flos": 19974438293760.0, "grad_norm": 2.2991912097388605, "language_loss": 0.88661456, "learning_rate": 3.995010261521478e-06, "loss": 0.90978289, "num_input_tokens_seen": 7842115, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 1.8125, "step": 376, "time_per_iteration": 2.418308973312378 }, { "auxiliary_loss_clip": 0.0122968, "auxiliary_loss_mlp": 0.01071277, "balance_loss_clip": 1.02774143, "balance_loss_mlp": 1.04656196, "epoch": 0.02266646625582444, "flos": 16027437669120.0, "grad_norm": 2.4116974172152337, "language_loss": 0.74843597, "learning_rate": 3.9949835539304545e-06, "loss": 0.77144551, "num_input_tokens_seen": 7857830, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 1.828125, "step": 377, "time_per_iteration": 2.4013772010803223 }, { "auxiliary_loss_clip": 0.01224565, "auxiliary_loss_mlp": 0.01075962, "balance_loss_clip": 1.03197408, "balance_loss_mlp": 1.04923904, "epoch": 0.02272658950849241, "flos": 20403800593920.0, "grad_norm": 2.256167411950087, "language_loss": 0.9871459, "learning_rate": 3.9949567751436e-06, "loss": 1.01015115, "num_input_tokens_seen": 7875840, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 1.75, "step": 378, "time_per_iteration": 2.4311933517456055 }, { "auxiliary_loss_clip": 0.01114269, "auxiliary_loss_mlp": 0.010195, "balance_loss_clip": 1.0085324, "balance_loss_mlp": 1.02366507, "epoch": 0.02278671276116038, "flos": 69843990176640.0, "grad_norm": 0.9572279775789706, "language_loss": 0.75515658, "learning_rate": 3.99492992516187e-06, "loss": 0.77649432, "num_input_tokens_seen": 7940190, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.90625, "step": 379, "time_per_iteration": 3.109609842300415 }, { "auxiliary_loss_clip": 0.01233634, "auxiliary_loss_mlp": 0.01072328, "balance_loss_clip": 1.02812481, "balance_loss_mlp": 1.04682207, "epoch": 0.022846836013828347, "flos": 38507243927040.0, "grad_norm": 2.251694170849771, "language_loss": 0.78342873, "learning_rate": 3.994903003986222e-06, "loss": 0.80648834, "num_input_tokens_seen": 7960840, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 1.8671875, "step": 380, "time_per_iteration": 2.5509915351867676 }, { "auxiliary_loss_clip": 0.01224327, "auxiliary_loss_mlp": 0.01077174, "balance_loss_clip": 1.03294706, "balance_loss_mlp": 1.0472827, "epoch": 0.02290695926649632, "flos": 20958430452480.0, "grad_norm": 2.3818750916319096, "language_loss": 0.95732051, "learning_rate": 3.9948760116176174e-06, "loss": 0.98033547, "num_input_tokens_seen": 7975500, "router_z_loss_clip": 0.44335938, "router_z_loss_mlp": 1.7734375, "step": 381, "time_per_iteration": 2.4247195720672607 }, { "auxiliary_loss_clip": 0.01233709, "auxiliary_loss_mlp": 0.01091844, "balance_loss_clip": 1.04683065, "balance_loss_mlp": 1.0478375, "epoch": 0.022967082519164288, "flos": 24021325918080.0, "grad_norm": 5.971767421602091, "language_loss": 0.87281406, "learning_rate": 3.994848948057019e-06, "loss": 0.89606953, "num_input_tokens_seen": 7993880, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 1.859375, "step": 382, "time_per_iteration": 2.4535419940948486 }, { "auxiliary_loss_clip": 0.01232721, "auxiliary_loss_mlp": 0.01079831, "balance_loss_clip": 1.03834581, "balance_loss_mlp": 1.04840291, "epoch": 0.023027205771832256, "flos": 20996066764800.0, "grad_norm": 2.0497498489302672, "language_loss": 0.84413087, "learning_rate": 3.994821813305394e-06, "loss": 0.8672564, "num_input_tokens_seen": 8012730, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.84375, "step": 383, "time_per_iteration": 2.427741765975952 }, { "auxiliary_loss_clip": 0.01224827, "auxiliary_loss_mlp": 0.01096017, "balance_loss_clip": 1.05219579, "balance_loss_mlp": 1.04934549, "epoch": 0.023087329024500225, "flos": 21359757064320.0, "grad_norm": 2.510335613890191, "language_loss": 0.82757276, "learning_rate": 3.99479460736371e-06, "loss": 0.8507812, "num_input_tokens_seen": 8031275, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 1.7578125, "step": 384, "time_per_iteration": 2.416593074798584 }, { "auxiliary_loss_clip": 0.01226601, "auxiliary_loss_mlp": 0.0107713, "balance_loss_clip": 1.03562117, "balance_loss_mlp": 1.05061913, "epoch": 0.023147452277168194, "flos": 21871339349760.0, "grad_norm": 2.094474581463722, "language_loss": 0.88727117, "learning_rate": 3.994767330232937e-06, "loss": 0.91030848, "num_input_tokens_seen": 8051600, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.7578125, "step": 385, "time_per_iteration": 2.4557037353515625 }, { "auxiliary_loss_clip": 0.01231744, "auxiliary_loss_mlp": 0.01084369, "balance_loss_clip": 1.04207373, "balance_loss_mlp": 1.05157328, "epoch": 0.023207575529836166, "flos": 18915697180800.0, "grad_norm": 2.4968411771950567, "language_loss": 0.69599569, "learning_rate": 3.994739981914049e-06, "loss": 0.71915674, "num_input_tokens_seen": 8070600, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.796875, "step": 386, "time_per_iteration": 2.4219396114349365 }, { "auxiliary_loss_clip": 0.01227764, "auxiliary_loss_mlp": 0.01078979, "balance_loss_clip": 1.03563499, "balance_loss_mlp": 1.05063343, "epoch": 0.023267698782504134, "flos": 25044839602560.0, "grad_norm": 9.04059514600862, "language_loss": 0.87687516, "learning_rate": 3.994712562408022e-06, "loss": 0.89994264, "num_input_tokens_seen": 8090680, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 1.765625, "step": 387, "time_per_iteration": 2.4649181365966797 }, { "auxiliary_loss_clip": 0.01228789, "auxiliary_loss_mlp": 0.01068427, "balance_loss_clip": 1.02572608, "balance_loss_mlp": 1.05026042, "epoch": 0.023327822035172103, "flos": 28877883949440.0, "grad_norm": 1.990605289011327, "language_loss": 0.83328348, "learning_rate": 3.994685071715835e-06, "loss": 0.85625565, "num_input_tokens_seen": 8114610, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 1.7890625, "step": 388, "time_per_iteration": 2.5158395767211914 }, { "auxiliary_loss_clip": 0.01225898, "auxiliary_loss_mlp": 0.01072797, "balance_loss_clip": 1.03352904, "balance_loss_mlp": 1.04716516, "epoch": 0.02338794528784007, "flos": 27120426330240.0, "grad_norm": 2.7565455513494936, "language_loss": 0.9320004, "learning_rate": 3.9946575098384686e-06, "loss": 0.95498735, "num_input_tokens_seen": 8133975, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 1.7890625, "step": 389, "time_per_iteration": 2.494342803955078 }, { "auxiliary_loss_clip": 0.0122056, "auxiliary_loss_mlp": 0.01076788, "balance_loss_clip": 1.0343492, "balance_loss_mlp": 1.04805517, "epoch": 0.02344806854050804, "flos": 21321352702080.0, "grad_norm": 3.0699235696824085, "language_loss": 0.87314248, "learning_rate": 3.9946298767769065e-06, "loss": 0.89611602, "num_input_tokens_seen": 8153570, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 1.7265625, "step": 390, "time_per_iteration": 2.453216552734375 }, { "auxiliary_loss_clip": 0.01223692, "auxiliary_loss_mlp": 0.01069994, "balance_loss_clip": 1.0310601, "balance_loss_mlp": 1.04871178, "epoch": 0.023508191793176012, "flos": 24788856447360.0, "grad_norm": 4.83313340351558, "language_loss": 0.88527739, "learning_rate": 3.994602172532135e-06, "loss": 0.90821421, "num_input_tokens_seen": 8170075, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 1.75, "step": 391, "time_per_iteration": 2.455122947692871 }, { "auxiliary_loss_clip": 0.01220029, "auxiliary_loss_mlp": 0.01064478, "balance_loss_clip": 1.02511477, "balance_loss_mlp": 1.04577327, "epoch": 0.02356831504584398, "flos": 25994162914560.0, "grad_norm": 3.6976594815005357, "language_loss": 0.86070114, "learning_rate": 3.994574397105143e-06, "loss": 0.88354623, "num_input_tokens_seen": 8190420, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 1.7421875, "step": 392, "time_per_iteration": 2.462695837020874 }, { "auxiliary_loss_clip": 0.01219756, "auxiliary_loss_mlp": 0.01065758, "balance_loss_clip": 1.02577543, "balance_loss_mlp": 1.04501486, "epoch": 0.02362843829851195, "flos": 19061459573760.0, "grad_norm": 1.9887120025248302, "language_loss": 0.88842404, "learning_rate": 3.994546550496921e-06, "loss": 0.91127914, "num_input_tokens_seen": 8208790, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 1.75, "step": 393, "time_per_iteration": 2.4153332710266113 }, { "auxiliary_loss_clip": 0.01226983, "auxiliary_loss_mlp": 0.01083042, "balance_loss_clip": 1.03962576, "balance_loss_mlp": 1.05139089, "epoch": 0.023688561551179918, "flos": 16070101217280.0, "grad_norm": 3.2261350159623565, "language_loss": 0.81036854, "learning_rate": 3.994518632708464e-06, "loss": 0.83346879, "num_input_tokens_seen": 8226885, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 1.75, "step": 394, "time_per_iteration": 2.4195621013641357 }, { "auxiliary_loss_clip": 0.01222919, "auxiliary_loss_mlp": 0.0107631, "balance_loss_clip": 1.03420556, "balance_loss_mlp": 1.04789519, "epoch": 0.023748684803847887, "flos": 21723342629760.0, "grad_norm": 1.9563544568378761, "language_loss": 0.85760427, "learning_rate": 3.994490643740766e-06, "loss": 0.88059652, "num_input_tokens_seen": 8246825, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.75, "step": 395, "time_per_iteration": 2.447772741317749 }, { "auxiliary_loss_clip": 0.01105912, "auxiliary_loss_mlp": 0.01019347, "balance_loss_clip": 1.00837958, "balance_loss_mlp": 1.0205375, "epoch": 0.02380880805651586, "flos": 61923175136640.0, "grad_norm": 0.91434065681227, "language_loss": 0.63803995, "learning_rate": 3.994462583594828e-06, "loss": 0.65929258, "num_input_tokens_seen": 8302835, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.8515625, "step": 396, "time_per_iteration": 2.896740198135376 }, { "auxiliary_loss_clip": 0.01220118, "auxiliary_loss_mlp": 0.01063434, "balance_loss_clip": 1.02218795, "balance_loss_mlp": 1.04511356, "epoch": 0.023868931309183827, "flos": 20265299763840.0, "grad_norm": 2.704976210508278, "language_loss": 0.83204758, "learning_rate": 3.994434452271651e-06, "loss": 0.85488307, "num_input_tokens_seen": 8320745, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.75, "step": 397, "time_per_iteration": 2.4413938522338867 }, { "auxiliary_loss_clip": 0.0122405, "auxiliary_loss_mlp": 0.01069104, "balance_loss_clip": 1.02654648, "balance_loss_mlp": 1.04862189, "epoch": 0.023929054561851796, "flos": 21138128553600.0, "grad_norm": 2.4026621418980687, "language_loss": 0.84061825, "learning_rate": 3.994406249772239e-06, "loss": 0.86354977, "num_input_tokens_seen": 8339540, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 1.75, "step": 398, "time_per_iteration": 2.4305572509765625 }, { "auxiliary_loss_clip": 0.01224106, "auxiliary_loss_mlp": 0.01077038, "balance_loss_clip": 1.03304994, "balance_loss_mlp": 1.04774022, "epoch": 0.023989177814519765, "flos": 13697683176960.0, "grad_norm": 3.4160456873041443, "language_loss": 0.8576498, "learning_rate": 3.994377976097598e-06, "loss": 0.88066125, "num_input_tokens_seen": 8354890, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 1.765625, "step": 399, "time_per_iteration": 2.400832176208496 }, { "auxiliary_loss_clip": 0.01219711, "auxiliary_loss_mlp": 0.01079695, "balance_loss_clip": 1.03799558, "balance_loss_mlp": 1.04665935, "epoch": 0.024049301067187733, "flos": 26320845306240.0, "grad_norm": 2.980519631223286, "language_loss": 0.85427976, "learning_rate": 3.9943496312487365e-06, "loss": 0.87727386, "num_input_tokens_seen": 8375845, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 1.734375, "step": 400, "time_per_iteration": 2.48085618019104 }, { "auxiliary_loss_clip": 0.01224102, "auxiliary_loss_mlp": 0.01069693, "balance_loss_clip": 1.02756393, "balance_loss_mlp": 1.05105746, "epoch": 0.024109424319855705, "flos": 24424293363840.0, "grad_norm": 1.8415507104795583, "language_loss": 0.7897774, "learning_rate": 3.994321215226667e-06, "loss": 0.81271529, "num_input_tokens_seen": 8395240, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.734375, "step": 401, "time_per_iteration": 2.4944117069244385 }, { "auxiliary_loss_clip": 0.01219562, "auxiliary_loss_mlp": 0.01078446, "balance_loss_clip": 1.03874898, "balance_loss_mlp": 1.04854548, "epoch": 0.024169547572523674, "flos": 29603169866880.0, "grad_norm": 2.2638273233506365, "language_loss": 0.78047067, "learning_rate": 3.994292728032404e-06, "loss": 0.80345076, "num_input_tokens_seen": 8416950, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 1.703125, "step": 402, "time_per_iteration": 2.5082874298095703 }, { "auxiliary_loss_clip": 0.0122181, "auxiliary_loss_mlp": 0.01069983, "balance_loss_clip": 1.02740133, "balance_loss_mlp": 1.04404521, "epoch": 0.024229670825191642, "flos": 22600360782720.0, "grad_norm": 2.746171059960839, "language_loss": 0.94473672, "learning_rate": 3.994264169666963e-06, "loss": 0.96765459, "num_input_tokens_seen": 8433660, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 1.7734375, "step": 403, "time_per_iteration": 2.4508306980133057 }, { "auxiliary_loss_clip": 0.01228746, "auxiliary_loss_mlp": 0.01071185, "balance_loss_clip": 1.0266248, "balance_loss_mlp": 1.04774153, "epoch": 0.02428979407785961, "flos": 18149283815040.0, "grad_norm": 2.778395112563993, "language_loss": 0.9934364, "learning_rate": 3.994235540131364e-06, "loss": 1.01643562, "num_input_tokens_seen": 8450180, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 1.8125, "step": 404, "time_per_iteration": 2.392777919769287 }, { "auxiliary_loss_clip": 0.01224341, "auxiliary_loss_mlp": 0.01079139, "balance_loss_clip": 1.03729606, "balance_loss_mlp": 1.04792476, "epoch": 0.024349917330527583, "flos": 15304071876480.0, "grad_norm": 3.1709684895581076, "language_loss": 0.87440234, "learning_rate": 3.994206839426627e-06, "loss": 0.8974371, "num_input_tokens_seen": 8467775, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 1.765625, "step": 405, "time_per_iteration": 2.414794445037842 }, { "auxiliary_loss_clip": 0.01232461, "auxiliary_loss_mlp": 0.01072855, "balance_loss_clip": 1.02982068, "balance_loss_mlp": 1.05070579, "epoch": 0.024410040583195552, "flos": 20772937065600.0, "grad_norm": 3.0795526101990034, "language_loss": 0.93019068, "learning_rate": 3.994178067553779e-06, "loss": 0.95324385, "num_input_tokens_seen": 8486765, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 1.8203125, "step": 406, "time_per_iteration": 2.4340293407440186 }, { "auxiliary_loss_clip": 0.01225436, "auxiliary_loss_mlp": 0.01079018, "balance_loss_clip": 1.03405201, "balance_loss_mlp": 1.04839635, "epoch": 0.02447016383586352, "flos": 21797777381760.0, "grad_norm": 4.538106572159414, "language_loss": 0.86687589, "learning_rate": 3.994149224513846e-06, "loss": 0.88992041, "num_input_tokens_seen": 8506515, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 1.765625, "step": 407, "time_per_iteration": 2.441693067550659 }, { "auxiliary_loss_clip": 0.01223369, "auxiliary_loss_mlp": 0.01074858, "balance_loss_clip": 1.03272939, "balance_loss_mlp": 1.04847205, "epoch": 0.02453028708853149, "flos": 33946714247040.0, "grad_norm": 2.046892011184857, "language_loss": 0.73118854, "learning_rate": 3.994120310307856e-06, "loss": 0.75417078, "num_input_tokens_seen": 8528035, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 1.75, "step": 408, "time_per_iteration": 2.5407135486602783 }, { "auxiliary_loss_clip": 0.01228314, "auxiliary_loss_mlp": 0.01074045, "balance_loss_clip": 1.03263187, "balance_loss_mlp": 1.05185044, "epoch": 0.024590410341199458, "flos": 21792086830080.0, "grad_norm": 2.7996715546795694, "language_loss": 0.92269748, "learning_rate": 3.994091324936841e-06, "loss": 0.94572109, "num_input_tokens_seen": 8546455, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.765625, "step": 409, "time_per_iteration": 3.8719043731689453 }, { "auxiliary_loss_clip": 0.01225605, "auxiliary_loss_mlp": 0.01066815, "balance_loss_clip": 1.02556872, "balance_loss_mlp": 1.0501914, "epoch": 0.02465053359386743, "flos": 35113371972480.0, "grad_norm": 2.2476422238018245, "language_loss": 0.81878775, "learning_rate": 3.994062268401836e-06, "loss": 0.84171194, "num_input_tokens_seen": 8568450, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 1.7578125, "step": 410, "time_per_iteration": 2.5387113094329834 }, { "auxiliary_loss_clip": 0.01226802, "auxiliary_loss_mlp": 0.01076319, "balance_loss_clip": 1.03333235, "balance_loss_mlp": 1.04942226, "epoch": 0.0247106568465354, "flos": 27450250744320.0, "grad_norm": 2.654348941693858, "language_loss": 0.77659327, "learning_rate": 3.994033140703878e-06, "loss": 0.7996245, "num_input_tokens_seen": 8589340, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 1.7734375, "step": 411, "time_per_iteration": 3.9723191261291504 }, { "auxiliary_loss_clip": 0.0122833, "auxiliary_loss_mlp": 0.0108574, "balance_loss_clip": 1.04299116, "balance_loss_mlp": 1.04967451, "epoch": 0.024770780099203367, "flos": 20702761499520.0, "grad_norm": 2.2231464753261045, "language_loss": 0.86391199, "learning_rate": 3.994003941844007e-06, "loss": 0.88705271, "num_input_tokens_seen": 8607150, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 1.7890625, "step": 412, "time_per_iteration": 3.839991331100464 }, { "auxiliary_loss_clip": 0.01101658, "auxiliary_loss_mlp": 0.01014881, "balance_loss_clip": 1.00386584, "balance_loss_mlp": 1.02291083, "epoch": 0.024830903351871336, "flos": 69548625141120.0, "grad_norm": 0.8306108238647311, "language_loss": 0.5848062, "learning_rate": 3.993974671823265e-06, "loss": 0.60597157, "num_input_tokens_seen": 8669865, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.7890625, "step": 413, "time_per_iteration": 4.496323585510254 }, { "auxiliary_loss_clip": 0.01224728, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.03374577, "balance_loss_mlp": 1.04808426, "epoch": 0.024891026604539304, "flos": 32269102640640.0, "grad_norm": 2.8425881831129383, "language_loss": 0.80029666, "learning_rate": 3.9939453306426955e-06, "loss": 0.8233304, "num_input_tokens_seen": 8690235, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 1.765625, "step": 414, "time_per_iteration": 2.5428895950317383 }, { "auxiliary_loss_clip": 0.01218806, "auxiliary_loss_mlp": 0.01077296, "balance_loss_clip": 1.03511941, "balance_loss_mlp": 1.04687631, "epoch": 0.024951149857207276, "flos": 18839377215360.0, "grad_norm": 2.7484104693387423, "language_loss": 0.7967658, "learning_rate": 3.9939159183033466e-06, "loss": 0.81972682, "num_input_tokens_seen": 8706295, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.71875, "step": 415, "time_per_iteration": 2.4227468967437744 }, { "auxiliary_loss_clip": 0.01230682, "auxiliary_loss_mlp": 0.01078942, "balance_loss_clip": 1.03686094, "balance_loss_mlp": 1.05102003, "epoch": 0.025011273109875245, "flos": 15376307212800.0, "grad_norm": 3.0735378724622233, "language_loss": 0.95692146, "learning_rate": 3.9938864348062675e-06, "loss": 0.98001772, "num_input_tokens_seen": 8724200, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 1.796875, "step": 416, "time_per_iteration": 2.4133734703063965 }, { "auxiliary_loss_clip": 0.01223109, "auxiliary_loss_mlp": 0.01067797, "balance_loss_clip": 1.02349901, "balance_loss_mlp": 1.04716539, "epoch": 0.025071396362543213, "flos": 18914545105920.0, "grad_norm": 2.036718056261979, "language_loss": 0.77308404, "learning_rate": 3.993856880152509e-06, "loss": 0.79599309, "num_input_tokens_seen": 8744170, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 1.7578125, "step": 417, "time_per_iteration": 2.454113245010376 }, { "auxiliary_loss_clip": 0.01220784, "auxiliary_loss_mlp": 0.01082056, "balance_loss_clip": 1.03978431, "balance_loss_mlp": 1.05053282, "epoch": 0.025131519615211182, "flos": 25336783324800.0, "grad_norm": 1.654779187260334, "language_loss": 0.76904714, "learning_rate": 3.9938272543431286e-06, "loss": 0.79207551, "num_input_tokens_seen": 8765120, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.703125, "step": 418, "time_per_iteration": 2.50669002532959 }, { "auxiliary_loss_clip": 0.0122798, "auxiliary_loss_mlp": 0.01067283, "balance_loss_clip": 1.02725244, "balance_loss_mlp": 1.05104196, "epoch": 0.02519164286787915, "flos": 18952146506880.0, "grad_norm": 4.729071356260811, "language_loss": 0.81378472, "learning_rate": 3.993797557379182e-06, "loss": 0.83673733, "num_input_tokens_seen": 8783500, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 1.765625, "step": 419, "time_per_iteration": 2.421539545059204 }, { "auxiliary_loss_clip": 0.01219551, "auxiliary_loss_mlp": 0.01074075, "balance_loss_clip": 1.0309689, "balance_loss_mlp": 1.04970813, "epoch": 0.025251766120547123, "flos": 17420122736640.0, "grad_norm": 2.2544890987226625, "language_loss": 0.7359246, "learning_rate": 3.9937677892617295e-06, "loss": 0.75886083, "num_input_tokens_seen": 8801175, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 1.6953125, "step": 420, "time_per_iteration": 2.417229175567627 }, { "auxiliary_loss_clip": 0.01220992, "auxiliary_loss_mlp": 0.01075999, "balance_loss_clip": 1.03506291, "balance_loss_mlp": 1.04477525, "epoch": 0.02531188937321509, "flos": 25044281020800.0, "grad_norm": 1.9282394845206294, "language_loss": 0.78481078, "learning_rate": 3.993737949991833e-06, "loss": 0.80778074, "num_input_tokens_seen": 8820215, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 1.7578125, "step": 421, "time_per_iteration": 2.5036356449127197 }, { "auxiliary_loss_clip": 0.01219951, "auxiliary_loss_mlp": 0.01077345, "balance_loss_clip": 1.0353117, "balance_loss_mlp": 1.05088842, "epoch": 0.02537201262588306, "flos": 30590897541120.0, "grad_norm": 2.08196997858772, "language_loss": 0.81678551, "learning_rate": 3.993708039570557e-06, "loss": 0.8397584, "num_input_tokens_seen": 8839660, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 1.6875, "step": 422, "time_per_iteration": 2.503122568130493 }, { "auxiliary_loss_clip": 0.01221051, "auxiliary_loss_mlp": 0.01069051, "balance_loss_clip": 1.03059411, "balance_loss_mlp": 1.04643059, "epoch": 0.02543213587855103, "flos": 26064233746560.0, "grad_norm": 1.8956315955091694, "language_loss": 0.83241171, "learning_rate": 3.99367805799897e-06, "loss": 0.85531271, "num_input_tokens_seen": 8859280, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 1.75, "step": 423, "time_per_iteration": 2.467223882675171 }, { "auxiliary_loss_clip": 0.01215465, "auxiliary_loss_mlp": 0.01064725, "balance_loss_clip": 1.02540994, "balance_loss_mlp": 1.04517615, "epoch": 0.025492259131218997, "flos": 36021498013440.0, "grad_norm": 2.03259954828413, "language_loss": 0.74068058, "learning_rate": 3.993648005278142e-06, "loss": 0.76348245, "num_input_tokens_seen": 8880560, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 1.703125, "step": 424, "time_per_iteration": 2.521575689315796 }, { "auxiliary_loss_clip": 0.01225442, "auxiliary_loss_mlp": 0.01083442, "balance_loss_clip": 1.04109931, "balance_loss_mlp": 1.04883325, "epoch": 0.02555238238388697, "flos": 18587059752960.0, "grad_norm": 2.815339780265551, "language_loss": 0.8265295, "learning_rate": 3.993617881409143e-06, "loss": 0.84961832, "num_input_tokens_seen": 8899155, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 1.765625, "step": 425, "time_per_iteration": 2.4312291145324707 }, { "auxiliary_loss_clip": 0.01222589, "auxiliary_loss_mlp": 0.01077014, "balance_loss_clip": 1.03600645, "balance_loss_mlp": 1.0447588, "epoch": 0.025612505636554938, "flos": 24242046733440.0, "grad_norm": 3.8650408966719287, "language_loss": 0.85035753, "learning_rate": 3.993587686393052e-06, "loss": 0.87335354, "num_input_tokens_seen": 8917890, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 1.78125, "step": 426, "time_per_iteration": 2.4371635913848877 }, { "auxiliary_loss_clip": 0.01098027, "auxiliary_loss_mlp": 0.01017479, "balance_loss_clip": 1.00841868, "balance_loss_mlp": 1.0204736, "epoch": 0.025672628889222907, "flos": 60583661936640.0, "grad_norm": 0.8871951760081042, "language_loss": 0.57136494, "learning_rate": 3.993557420230944e-06, "loss": 0.59252, "num_input_tokens_seen": 8978260, "router_z_loss_clip": 0.09082031, "router_z_loss_mlp": 0.7734375, "step": 427, "time_per_iteration": 3.173518657684326 }, { "auxiliary_loss_clip": 0.01222285, "auxiliary_loss_mlp": 0.01077646, "balance_loss_clip": 1.03778279, "balance_loss_mlp": 1.04927766, "epoch": 0.025732752141890875, "flos": 19572238897920.0, "grad_norm": 3.820220230684572, "language_loss": 0.87785196, "learning_rate": 3.9935270829239e-06, "loss": 0.90085125, "num_input_tokens_seen": 8994460, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 1.734375, "step": 428, "time_per_iteration": 2.404013156890869 }, { "auxiliary_loss_clip": 0.01218422, "auxiliary_loss_mlp": 0.01069172, "balance_loss_clip": 1.02976179, "balance_loss_mlp": 1.04609978, "epoch": 0.025792875394558847, "flos": 31282945977600.0, "grad_norm": 1.7895895121794416, "language_loss": 0.85488737, "learning_rate": 3.993496674473002e-06, "loss": 0.87776333, "num_input_tokens_seen": 9016670, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 1.7265625, "step": 429, "time_per_iteration": 2.5000152587890625 }, { "auxiliary_loss_clip": 0.01218331, "auxiliary_loss_mlp": 0.01076159, "balance_loss_clip": 1.0330286, "balance_loss_mlp": 1.04326129, "epoch": 0.025852998647226816, "flos": 32378241150720.0, "grad_norm": 3.5761364735444694, "language_loss": 0.88163298, "learning_rate": 3.993466194879335e-06, "loss": 0.90457785, "num_input_tokens_seen": 9039720, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 1.75, "step": 430, "time_per_iteration": 2.5054399967193604 }, { "auxiliary_loss_clip": 0.01221007, "auxiliary_loss_mlp": 0.01064797, "balance_loss_clip": 1.02712727, "balance_loss_mlp": 1.05101562, "epoch": 0.025913121899894784, "flos": 20192261466240.0, "grad_norm": 2.0882154604930507, "language_loss": 0.83566093, "learning_rate": 3.993435644143989e-06, "loss": 0.85851896, "num_input_tokens_seen": 9059850, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 1.703125, "step": 431, "time_per_iteration": 2.413207769393921 }, { "auxiliary_loss_clip": 0.01219247, "auxiliary_loss_mlp": 0.01062272, "balance_loss_clip": 1.02312338, "balance_loss_mlp": 1.04578114, "epoch": 0.025973245152562753, "flos": 14719556027520.0, "grad_norm": 4.11964093237474, "language_loss": 0.86177897, "learning_rate": 3.993405022268051e-06, "loss": 0.8845942, "num_input_tokens_seen": 9077590, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 1.734375, "step": 432, "time_per_iteration": 2.3806257247924805 }, { "auxiliary_loss_clip": 0.01222087, "auxiliary_loss_mlp": 0.01061348, "balance_loss_clip": 1.02436972, "balance_loss_mlp": 1.04628515, "epoch": 0.02603336840523072, "flos": 30991665571200.0, "grad_norm": 2.6666100728953968, "language_loss": 0.75972843, "learning_rate": 3.993374329252616e-06, "loss": 0.78256285, "num_input_tokens_seen": 9099880, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.7578125, "step": 433, "time_per_iteration": 2.51416277885437 }, { "auxiliary_loss_clip": 0.01221008, "auxiliary_loss_mlp": 0.01080675, "balance_loss_clip": 1.0364722, "balance_loss_mlp": 1.04617822, "epoch": 0.026093491657898694, "flos": 17673347894400.0, "grad_norm": 1.959187456798102, "language_loss": 0.89468384, "learning_rate": 3.993343565098778e-06, "loss": 0.91770065, "num_input_tokens_seen": 9118620, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 1.7421875, "step": 434, "time_per_iteration": 2.4000027179718018 }, { "auxiliary_loss_clip": 0.01221612, "auxiliary_loss_mlp": 0.01081696, "balance_loss_clip": 1.04037833, "balance_loss_mlp": 1.05102932, "epoch": 0.026153614910566662, "flos": 17856921156480.0, "grad_norm": 2.096397488032798, "language_loss": 0.79408079, "learning_rate": 3.993312729807637e-06, "loss": 0.81711388, "num_input_tokens_seen": 9135655, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.703125, "step": 435, "time_per_iteration": 2.4256155490875244 }, { "auxiliary_loss_clip": 0.01217634, "auxiliary_loss_mlp": 0.01078944, "balance_loss_clip": 1.03753078, "balance_loss_mlp": 1.04609227, "epoch": 0.02621373816323463, "flos": 20010084658560.0, "grad_norm": 2.5845390045116687, "language_loss": 0.86184919, "learning_rate": 3.993281823380292e-06, "loss": 0.88481498, "num_input_tokens_seen": 9153520, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.71875, "step": 436, "time_per_iteration": 2.4278886318206787 }, { "auxiliary_loss_clip": 0.012181, "auxiliary_loss_mlp": 0.01087453, "balance_loss_clip": 1.04573023, "balance_loss_mlp": 1.0470084, "epoch": 0.0262738614159026, "flos": 19280190441600.0, "grad_norm": 4.714988608425289, "language_loss": 0.74434, "learning_rate": 3.993250845817845e-06, "loss": 0.76739556, "num_input_tokens_seen": 9170750, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 1.7109375, "step": 437, "time_per_iteration": 2.4319581985473633 }, { "auxiliary_loss_clip": 0.01215749, "auxiliary_loss_mlp": 0.01087014, "balance_loss_clip": 1.04405141, "balance_loss_mlp": 1.0473398, "epoch": 0.026333984668570568, "flos": 18806209557120.0, "grad_norm": 5.086534458318834, "language_loss": 0.91138661, "learning_rate": 3.9932197971214026e-06, "loss": 0.93441427, "num_input_tokens_seen": 9188430, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 1.6875, "step": 438, "time_per_iteration": 2.4063069820404053 }, { "auxiliary_loss_clip": 0.0109988, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.02302861, "balance_loss_mlp": 1.02063978, "epoch": 0.02639410792123854, "flos": 64568403607680.0, "grad_norm": 0.8590789451679222, "language_loss": 0.62551713, "learning_rate": 3.9931886772920735e-06, "loss": 0.64683968, "num_input_tokens_seen": 9255835, "router_z_loss_clip": 0.09326172, "router_z_loss_mlp": 0.7890625, "step": 439, "time_per_iteration": 3.1644980907440186 }, { "auxiliary_loss_clip": 0.01225435, "auxiliary_loss_mlp": 0.01080469, "balance_loss_clip": 1.03738666, "balance_loss_mlp": 1.04984474, "epoch": 0.02645423117390651, "flos": 28472263240320.0, "grad_norm": 6.737809188874736, "language_loss": 0.75231874, "learning_rate": 3.993157486330967e-06, "loss": 0.77537781, "num_input_tokens_seen": 9276835, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 1.7578125, "step": 440, "time_per_iteration": 2.4854986667633057 }, { "auxiliary_loss_clip": 0.01219877, "auxiliary_loss_mlp": 0.01067345, "balance_loss_clip": 1.02543068, "balance_loss_mlp": 1.04653168, "epoch": 0.026514354426574478, "flos": 18550261313280.0, "grad_norm": 2.6045460134394824, "language_loss": 0.82804596, "learning_rate": 3.993126224239198e-06, "loss": 0.85091817, "num_input_tokens_seen": 9295075, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 1.734375, "step": 441, "time_per_iteration": 2.4888463020324707 }, { "auxiliary_loss_clip": 0.01220228, "auxiliary_loss_mlp": 0.01074593, "balance_loss_clip": 1.0329653, "balance_loss_mlp": 1.04525423, "epoch": 0.026574477679242446, "flos": 20666766021120.0, "grad_norm": 2.4595969775424327, "language_loss": 0.78507668, "learning_rate": 3.99309489101788e-06, "loss": 0.80802488, "num_input_tokens_seen": 9314205, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 1.75, "step": 442, "time_per_iteration": 2.4291138648986816 }, { "auxiliary_loss_clip": 0.01090726, "auxiliary_loss_mlp": 0.01011347, "balance_loss_clip": 1.00285959, "balance_loss_mlp": 1.01481843, "epoch": 0.026634600931910415, "flos": 57953026414080.0, "grad_norm": 0.9492683728905594, "language_loss": 0.644611, "learning_rate": 3.993063486668132e-06, "loss": 0.66563171, "num_input_tokens_seen": 9367395, "router_z_loss_clip": 0.08496094, "router_z_loss_mlp": 0.7578125, "step": 443, "time_per_iteration": 2.9280450344085693 }, { "auxiliary_loss_clip": 0.01222362, "auxiliary_loss_mlp": 0.01079253, "balance_loss_clip": 1.03905571, "balance_loss_mlp": 1.05135286, "epoch": 0.026694724184578387, "flos": 15814222796160.0, "grad_norm": 2.0330872338587667, "language_loss": 0.82178068, "learning_rate": 3.993032011191076e-06, "loss": 0.8447969, "num_input_tokens_seen": 9385185, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 1.7109375, "step": 444, "time_per_iteration": 2.4048736095428467 }, { "auxiliary_loss_clip": 0.01221715, "auxiliary_loss_mlp": 0.01070907, "balance_loss_clip": 1.02789617, "balance_loss_mlp": 1.04752612, "epoch": 0.026754847437246355, "flos": 23439149130240.0, "grad_norm": 2.2354540032417507, "language_loss": 0.95266509, "learning_rate": 3.993000464587833e-06, "loss": 0.9755913, "num_input_tokens_seen": 9403225, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 1.7421875, "step": 445, "time_per_iteration": 2.434636354446411 }, { "auxiliary_loss_clip": 0.01221157, "auxiliary_loss_mlp": 0.01077582, "balance_loss_clip": 1.03552508, "balance_loss_mlp": 1.04775894, "epoch": 0.026814970689914324, "flos": 17341009862400.0, "grad_norm": 2.175289182417039, "language_loss": 0.91126347, "learning_rate": 3.9929688468595305e-06, "loss": 0.93425083, "num_input_tokens_seen": 9420540, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 1.734375, "step": 446, "time_per_iteration": 2.3854146003723145 }, { "auxiliary_loss_clip": 0.01220393, "auxiliary_loss_mlp": 0.01072297, "balance_loss_clip": 1.03069329, "balance_loss_mlp": 1.04886007, "epoch": 0.026875093942582293, "flos": 17893754507520.0, "grad_norm": 2.6742440873310374, "language_loss": 0.79533404, "learning_rate": 3.992937158007296e-06, "loss": 0.81826091, "num_input_tokens_seen": 9438840, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 1.71875, "step": 447, "time_per_iteration": 2.436859130859375 }, { "auxiliary_loss_clip": 0.01217204, "auxiliary_loss_mlp": 0.01061085, "balance_loss_clip": 1.02317667, "balance_loss_mlp": 1.0459522, "epoch": 0.026935217195250265, "flos": 21722958604800.0, "grad_norm": 2.7414730375156515, "language_loss": 0.86134863, "learning_rate": 3.992905398032262e-06, "loss": 0.88413143, "num_input_tokens_seen": 9457215, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.7109375, "step": 448, "time_per_iteration": 2.4112770557403564 }, { "auxiliary_loss_clip": 0.01212256, "auxiliary_loss_mlp": 0.01073785, "balance_loss_clip": 1.03573346, "balance_loss_mlp": 1.04552698, "epoch": 0.026995340447918233, "flos": 23621570317440.0, "grad_norm": 2.07756945998201, "language_loss": 0.88353348, "learning_rate": 3.992873566935559e-06, "loss": 0.90639389, "num_input_tokens_seen": 9475615, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.671875, "step": 449, "time_per_iteration": 3.893214225769043 }, { "auxiliary_loss_clip": 0.01223196, "auxiliary_loss_mlp": 0.01071437, "balance_loss_clip": 1.02995205, "balance_loss_mlp": 1.04884255, "epoch": 0.027055463700586202, "flos": 17930308567680.0, "grad_norm": 2.1023873559554254, "language_loss": 0.80007172, "learning_rate": 3.992841664718326e-06, "loss": 0.82301807, "num_input_tokens_seen": 9493975, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.7421875, "step": 450, "time_per_iteration": 2.4293415546417236 }, { "auxiliary_loss_clip": 0.01213427, "auxiliary_loss_mlp": 0.01068471, "balance_loss_clip": 1.02751112, "balance_loss_mlp": 1.04947925, "epoch": 0.02711558695325417, "flos": 25117738254720.0, "grad_norm": 1.6694600928474144, "language_loss": 0.81280768, "learning_rate": 3.9928096913817e-06, "loss": 0.83562666, "num_input_tokens_seen": 9514810, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 1.640625, "step": 451, "time_per_iteration": 5.31993293762207 }, { "auxiliary_loss_clip": 0.01217688, "auxiliary_loss_mlp": 0.01082696, "balance_loss_clip": 1.03939891, "balance_loss_mlp": 1.04973805, "epoch": 0.02717571020592214, "flos": 24238520686080.0, "grad_norm": 1.8430085290678004, "language_loss": 0.76597822, "learning_rate": 3.992777646926822e-06, "loss": 0.78898203, "num_input_tokens_seen": 9533635, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 1.6796875, "step": 452, "time_per_iteration": 2.4252092838287354 }, { "auxiliary_loss_clip": 0.01216012, "auxiliary_loss_mlp": 0.01073324, "balance_loss_clip": 1.03448617, "balance_loss_mlp": 1.04734445, "epoch": 0.02723583345859011, "flos": 25117773166080.0, "grad_norm": 1.8424704499023885, "language_loss": 0.72687912, "learning_rate": 3.992745531354836e-06, "loss": 0.74977249, "num_input_tokens_seen": 9555420, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 1.6875, "step": 453, "time_per_iteration": 3.840744733810425 }, { "auxiliary_loss_clip": 0.01213416, "auxiliary_loss_mlp": 0.01077839, "balance_loss_clip": 1.03923905, "balance_loss_mlp": 1.04579771, "epoch": 0.02729595671125808, "flos": 42739939140480.0, "grad_norm": 1.9164462172076624, "language_loss": 0.81865823, "learning_rate": 3.992713344666888e-06, "loss": 0.84157073, "num_input_tokens_seen": 9578950, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 1.671875, "step": 454, "time_per_iteration": 2.601329803466797 }, { "auxiliary_loss_clip": 0.01217129, "auxiliary_loss_mlp": 0.01059361, "balance_loss_clip": 1.02178609, "balance_loss_mlp": 1.04818177, "epoch": 0.02735607996392605, "flos": 21430002453120.0, "grad_norm": 1.8384435634960097, "language_loss": 0.75141943, "learning_rate": 3.992681086864125e-06, "loss": 0.77418435, "num_input_tokens_seen": 9598160, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 1.6875, "step": 455, "time_per_iteration": 2.434659481048584 }, { "auxiliary_loss_clip": 0.01217381, "auxiliary_loss_mlp": 0.01073987, "balance_loss_clip": 1.03364635, "balance_loss_mlp": 1.0450505, "epoch": 0.027416203216594017, "flos": 20850199637760.0, "grad_norm": 3.601715071411152, "language_loss": 0.80229902, "learning_rate": 3.992648757947702e-06, "loss": 0.82521272, "num_input_tokens_seen": 9616010, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 1.71875, "step": 456, "time_per_iteration": 2.421617031097412 }, { "auxiliary_loss_clip": 0.01210392, "auxiliary_loss_mlp": 0.0107454, "balance_loss_clip": 1.0342474, "balance_loss_mlp": 1.04569876, "epoch": 0.027476326469261986, "flos": 13223667381120.0, "grad_norm": 2.3636243778557464, "language_loss": 0.81195503, "learning_rate": 3.99261635791877e-06, "loss": 0.83480436, "num_input_tokens_seen": 9634000, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 1.6484375, "step": 457, "time_per_iteration": 2.4353220462799072 }, { "auxiliary_loss_clip": 0.01215576, "auxiliary_loss_mlp": 0.01069389, "balance_loss_clip": 1.02940655, "balance_loss_mlp": 1.04417586, "epoch": 0.027536449721929958, "flos": 24023385688320.0, "grad_norm": 2.3286607514782713, "language_loss": 0.9358151, "learning_rate": 3.992583886778485e-06, "loss": 0.95866472, "num_input_tokens_seen": 9653455, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 1.71875, "step": 458, "time_per_iteration": 2.451904058456421 }, { "auxiliary_loss_clip": 0.01211667, "auxiliary_loss_mlp": 0.01075366, "balance_loss_clip": 1.03721941, "balance_loss_mlp": 1.04523611, "epoch": 0.027596572974597926, "flos": 13005215804160.0, "grad_norm": 2.2284725754265655, "language_loss": 0.78291839, "learning_rate": 3.9925513445280075e-06, "loss": 0.80578876, "num_input_tokens_seen": 9669650, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 1.6640625, "step": 459, "time_per_iteration": 2.4427225589752197 }, { "auxiliary_loss_clip": 0.0121507, "auxiliary_loss_mlp": 0.01068839, "balance_loss_clip": 1.02799821, "balance_loss_mlp": 1.0481329, "epoch": 0.027656696227265895, "flos": 26141810520960.0, "grad_norm": 1.873779291517176, "language_loss": 0.8316347, "learning_rate": 3.9925187311684975e-06, "loss": 0.85447371, "num_input_tokens_seen": 9691415, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 1.671875, "step": 460, "time_per_iteration": 2.481241226196289 }, { "auxiliary_loss_clip": 0.01087725, "auxiliary_loss_mlp": 0.01037586, "balance_loss_clip": 1.02890778, "balance_loss_mlp": 1.01300073, "epoch": 0.027716819479933864, "flos": 60693917610240.0, "grad_norm": 1.5945212505311077, "language_loss": 0.73599243, "learning_rate": 3.9924860467011195e-06, "loss": 0.75724554, "num_input_tokens_seen": 9755605, "router_z_loss_clip": 0.08691406, "router_z_loss_mlp": 0.75, "step": 461, "time_per_iteration": 3.049412488937378 }, { "auxiliary_loss_clip": 0.01213652, "auxiliary_loss_mlp": 0.01069088, "balance_loss_clip": 1.03408813, "balance_loss_mlp": 1.05053163, "epoch": 0.027776942732601832, "flos": 31210605907200.0, "grad_norm": 2.472476215251796, "language_loss": 0.8088612, "learning_rate": 3.99245329112704e-06, "loss": 0.83168852, "num_input_tokens_seen": 9776270, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 1.6328125, "step": 462, "time_per_iteration": 2.4856672286987305 }, { "auxiliary_loss_clip": 0.01214272, "auxiliary_loss_mlp": 0.01065799, "balance_loss_clip": 1.02932096, "balance_loss_mlp": 1.0486722, "epoch": 0.027837065985269804, "flos": 22673538725760.0, "grad_norm": 2.2505299356194177, "language_loss": 0.89811778, "learning_rate": 3.992420464447427e-06, "loss": 0.92091846, "num_input_tokens_seen": 9794465, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.65625, "step": 463, "time_per_iteration": 2.449388265609741 }, { "auxiliary_loss_clip": 0.01083923, "auxiliary_loss_mlp": 0.01013799, "balance_loss_clip": 1.00612223, "balance_loss_mlp": 1.01178098, "epoch": 0.027897189237937773, "flos": 62179437582720.0, "grad_norm": 0.8732072034279693, "language_loss": 0.5900414, "learning_rate": 3.992387566663454e-06, "loss": 0.6110186, "num_input_tokens_seen": 9849685, "router_z_loss_clip": 0.07666016, "router_z_loss_mlp": 0.72265625, "step": 464, "time_per_iteration": 3.076657295227051 }, { "auxiliary_loss_clip": 0.01224219, "auxiliary_loss_mlp": 0.01071273, "balance_loss_clip": 1.02985942, "balance_loss_mlp": 1.05110717, "epoch": 0.02795731249060574, "flos": 24492164780160.0, "grad_norm": 2.8476587695432993, "language_loss": 0.80872023, "learning_rate": 3.992354597776293e-06, "loss": 0.83167517, "num_input_tokens_seen": 9869505, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 1.734375, "step": 465, "time_per_iteration": 2.460120916366577 }, { "auxiliary_loss_clip": 0.01211628, "auxiliary_loss_mlp": 0.01077742, "balance_loss_clip": 1.03806865, "balance_loss_mlp": 1.04559112, "epoch": 0.02801743574327371, "flos": 23731860902400.0, "grad_norm": 2.0807330331238814, "language_loss": 0.78305185, "learning_rate": 3.992321557787121e-06, "loss": 0.80594552, "num_input_tokens_seen": 9890950, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 1.65625, "step": 466, "time_per_iteration": 2.440420627593994 }, { "auxiliary_loss_clip": 0.01210946, "auxiliary_loss_mlp": 0.01068143, "balance_loss_clip": 1.02928114, "balance_loss_mlp": 1.04549003, "epoch": 0.02807755899594168, "flos": 20628117279360.0, "grad_norm": 1.807040788688562, "language_loss": 0.87426627, "learning_rate": 3.992288446697118e-06, "loss": 0.89705718, "num_input_tokens_seen": 9911265, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 1.65625, "step": 467, "time_per_iteration": 2.429304361343384 }, { "auxiliary_loss_clip": 0.01218809, "auxiliary_loss_mlp": 0.01091612, "balance_loss_clip": 1.05005586, "balance_loss_mlp": 1.04589593, "epoch": 0.02813768224860965, "flos": 19243566558720.0, "grad_norm": 2.1615589645067237, "language_loss": 0.86052179, "learning_rate": 3.9922552645074644e-06, "loss": 0.88362604, "num_input_tokens_seen": 9929025, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 1.7265625, "step": 468, "time_per_iteration": 2.4182615280151367 }, { "auxiliary_loss_clip": 0.01216098, "auxiliary_loss_mlp": 0.01075451, "balance_loss_clip": 1.03553975, "balance_loss_mlp": 1.04683185, "epoch": 0.02819780550127762, "flos": 20812912439040.0, "grad_norm": 2.4673660674462172, "language_loss": 0.91542101, "learning_rate": 3.992222011219346e-06, "loss": 0.93833661, "num_input_tokens_seen": 9945190, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 1.6953125, "step": 469, "time_per_iteration": 2.420708179473877 }, { "auxiliary_loss_clip": 0.01208904, "auxiliary_loss_mlp": 0.01087393, "balance_loss_clip": 1.0482924, "balance_loss_mlp": 1.04541838, "epoch": 0.028257928753945588, "flos": 19973111662080.0, "grad_norm": 3.359520458405969, "language_loss": 0.80823982, "learning_rate": 3.992188686833948e-06, "loss": 0.83120275, "num_input_tokens_seen": 9962820, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 1.6328125, "step": 470, "time_per_iteration": 2.426966428756714 }, { "auxiliary_loss_clip": 0.01214693, "auxiliary_loss_mlp": 0.01074015, "balance_loss_clip": 1.0333643, "balance_loss_mlp": 1.0463984, "epoch": 0.028318052006613557, "flos": 20483472049920.0, "grad_norm": 2.0266611816436004, "language_loss": 0.92974067, "learning_rate": 3.992155291352461e-06, "loss": 0.95262778, "num_input_tokens_seen": 9982595, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 1.6796875, "step": 471, "time_per_iteration": 2.417511224746704 }, { "auxiliary_loss_clip": 0.01211363, "auxiliary_loss_mlp": 0.01070777, "balance_loss_clip": 1.03117526, "balance_loss_mlp": 1.0441376, "epoch": 0.02837817525928153, "flos": 28513495422720.0, "grad_norm": 2.0100640893231168, "language_loss": 0.76147437, "learning_rate": 3.992121824776075e-06, "loss": 0.78429568, "num_input_tokens_seen": 10004645, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 1.671875, "step": 472, "time_per_iteration": 2.49124813079834 }, { "auxiliary_loss_clip": 0.01216394, "auxiliary_loss_mlp": 0.01074437, "balance_loss_clip": 1.03524113, "balance_loss_mlp": 1.04516983, "epoch": 0.028438298511949497, "flos": 18550680249600.0, "grad_norm": 2.9233453117850345, "language_loss": 0.9328692, "learning_rate": 3.9920882871059865e-06, "loss": 0.95577747, "num_input_tokens_seen": 10022555, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 1.7109375, "step": 473, "time_per_iteration": 2.430454730987549 }, { "auxiliary_loss_clip": 0.01219842, "auxiliary_loss_mlp": 0.01089857, "balance_loss_clip": 1.05104256, "balance_loss_mlp": 1.04805279, "epoch": 0.028498421764617466, "flos": 16909273589760.0, "grad_norm": 3.6126159175986055, "language_loss": 0.88592136, "learning_rate": 3.992054678343391e-06, "loss": 0.90901834, "num_input_tokens_seen": 10041025, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 1.71875, "step": 474, "time_per_iteration": 2.3969032764434814 }, { "auxiliary_loss_clip": 0.0121063, "auxiliary_loss_mlp": 0.01080179, "balance_loss_clip": 1.03936172, "balance_loss_mlp": 1.04794836, "epoch": 0.028558545017285435, "flos": 27777561540480.0, "grad_norm": 2.2069611979958164, "language_loss": 0.78739357, "learning_rate": 3.992020998489488e-06, "loss": 0.81030166, "num_input_tokens_seen": 10060775, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 1.625, "step": 475, "time_per_iteration": 2.4957873821258545 }, { "auxiliary_loss_clip": 0.01078655, "auxiliary_loss_mlp": 0.0105164, "balance_loss_clip": 1.04286611, "balance_loss_mlp": 1.01238871, "epoch": 0.028618668269953403, "flos": 65651060868480.0, "grad_norm": 0.9168251895118754, "language_loss": 0.66889834, "learning_rate": 3.991987247545479e-06, "loss": 0.69020128, "num_input_tokens_seen": 10120225, "router_z_loss_clip": 0.08789062, "router_z_loss_mlp": 0.6640625, "step": 476, "time_per_iteration": 2.975771188735962 }, { "auxiliary_loss_clip": 0.01212109, "auxiliary_loss_mlp": 0.01079319, "balance_loss_clip": 1.03947902, "balance_loss_mlp": 1.04676247, "epoch": 0.028678791522621375, "flos": 21936208389120.0, "grad_norm": 2.333401231724457, "language_loss": 0.83673292, "learning_rate": 3.99195342551257e-06, "loss": 0.85964721, "num_input_tokens_seen": 10137880, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 1.65625, "step": 477, "time_per_iteration": 2.425384521484375 }, { "auxiliary_loss_clip": 0.01219363, "auxiliary_loss_mlp": 0.01076292, "balance_loss_clip": 1.03509319, "balance_loss_mlp": 1.04858422, "epoch": 0.028738914775289344, "flos": 24570963452160.0, "grad_norm": 2.376657919351714, "language_loss": 0.81632209, "learning_rate": 3.991919532391967e-06, "loss": 0.8392787, "num_input_tokens_seen": 10156930, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 1.703125, "step": 478, "time_per_iteration": 2.463796854019165 }, { "auxiliary_loss_clip": 0.01212562, "auxiliary_loss_mlp": 0.01074185, "balance_loss_clip": 1.03467917, "balance_loss_mlp": 1.04589367, "epoch": 0.028799038027957313, "flos": 23256867588480.0, "grad_norm": 1.98261751435751, "language_loss": 0.8049897, "learning_rate": 3.991885568184879e-06, "loss": 0.82785714, "num_input_tokens_seen": 10176295, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 1.6640625, "step": 479, "time_per_iteration": 2.42974591255188 }, { "auxiliary_loss_clip": 0.01211035, "auxiliary_loss_mlp": 0.01074508, "balance_loss_clip": 1.03156877, "balance_loss_mlp": 1.04642069, "epoch": 0.02885916128062528, "flos": 22163003781120.0, "grad_norm": 2.75380698709829, "language_loss": 0.7387349, "learning_rate": 3.991851532892521e-06, "loss": 0.76159036, "num_input_tokens_seen": 10195790, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 1.6484375, "step": 480, "time_per_iteration": 2.4637868404388428 }, { "auxiliary_loss_clip": 0.01211482, "auxiliary_loss_mlp": 0.01066037, "balance_loss_clip": 1.03067946, "balance_loss_mlp": 1.04742777, "epoch": 0.02891928453329325, "flos": 22931651473920.0, "grad_norm": 1.727787042430347, "language_loss": 0.8761667, "learning_rate": 3.991817426516103e-06, "loss": 0.89894187, "num_input_tokens_seen": 10218405, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.640625, "step": 481, "time_per_iteration": 2.4658827781677246 }, { "auxiliary_loss_clip": 0.01075504, "auxiliary_loss_mlp": 0.01015323, "balance_loss_clip": 1.00783658, "balance_loss_mlp": 1.01079535, "epoch": 0.028979407785961222, "flos": 57430202670720.0, "grad_norm": 0.949522922299035, "language_loss": 0.66014594, "learning_rate": 3.991783249056846e-06, "loss": 0.68105423, "num_input_tokens_seen": 10271005, "router_z_loss_clip": 0.07470703, "router_z_loss_mlp": 0.6484375, "step": 482, "time_per_iteration": 2.7990095615386963 }, { "auxiliary_loss_clip": 0.01219808, "auxiliary_loss_mlp": 0.01078013, "balance_loss_clip": 1.03855503, "balance_loss_mlp": 1.04768646, "epoch": 0.02903953103862919, "flos": 16721929900800.0, "grad_norm": 2.632323154501168, "language_loss": 0.78217971, "learning_rate": 3.991749000515968e-06, "loss": 0.80515796, "num_input_tokens_seen": 10288405, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 1.71875, "step": 483, "time_per_iteration": 2.390429973602295 }, { "auxiliary_loss_clip": 0.01213693, "auxiliary_loss_mlp": 0.01084099, "balance_loss_clip": 1.04392576, "balance_loss_mlp": 1.04758871, "epoch": 0.02909965429129716, "flos": 16762708235520.0, "grad_norm": 2.6496364320357797, "language_loss": 0.74926507, "learning_rate": 3.991714680894691e-06, "loss": 0.77224296, "num_input_tokens_seen": 10306875, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 1.65625, "step": 484, "time_per_iteration": 2.4088382720947266 }, { "auxiliary_loss_clip": 0.0121332, "auxiliary_loss_mlp": 0.0107373, "balance_loss_clip": 1.03410459, "balance_loss_mlp": 1.04610586, "epoch": 0.029159777543965128, "flos": 19784511164160.0, "grad_norm": 2.0936797742723923, "language_loss": 0.83411169, "learning_rate": 3.991680290194241e-06, "loss": 0.85698223, "num_input_tokens_seen": 10323965, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 1.671875, "step": 485, "time_per_iteration": 2.4037091732025146 }, { "auxiliary_loss_clip": 0.01216594, "auxiliary_loss_mlp": 0.01070947, "balance_loss_clip": 1.03299022, "balance_loss_mlp": 1.05073392, "epoch": 0.029219900796633096, "flos": 19641751148160.0, "grad_norm": 1.8682562352937333, "language_loss": 0.83862162, "learning_rate": 3.991645828415844e-06, "loss": 0.86149704, "num_input_tokens_seen": 10342620, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.65625, "step": 486, "time_per_iteration": 2.408877372741699 }, { "auxiliary_loss_clip": 0.01212274, "auxiliary_loss_mlp": 0.01086459, "balance_loss_clip": 1.04642892, "balance_loss_mlp": 1.04686427, "epoch": 0.02928002404930107, "flos": 25884500734080.0, "grad_norm": 2.1830433578473407, "language_loss": 0.88530236, "learning_rate": 3.991611295560732e-06, "loss": 0.90828967, "num_input_tokens_seen": 10364610, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 1.65625, "step": 487, "time_per_iteration": 2.484448194503784 }, { "auxiliary_loss_clip": 0.01215214, "auxiliary_loss_mlp": 0.01064765, "balance_loss_clip": 1.02649832, "balance_loss_mlp": 1.04820597, "epoch": 0.029340147301969037, "flos": 20659399724160.0, "grad_norm": 4.5142003549058325, "language_loss": 0.87968355, "learning_rate": 3.991576691630134e-06, "loss": 0.90248334, "num_input_tokens_seen": 10380910, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 1.671875, "step": 488, "time_per_iteration": 2.423797607421875 }, { "auxiliary_loss_clip": 0.01209508, "auxiliary_loss_mlp": 0.01070475, "balance_loss_clip": 1.03206539, "balance_loss_mlp": 1.04734719, "epoch": 0.029400270554637006, "flos": 24426806981760.0, "grad_norm": 7.590058532803281, "language_loss": 0.88534021, "learning_rate": 3.991542016625289e-06, "loss": 0.90814012, "num_input_tokens_seen": 10400665, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 1.625, "step": 489, "time_per_iteration": 3.889488458633423 }, { "auxiliary_loss_clip": 0.01204857, "auxiliary_loss_mlp": 0.01073573, "balance_loss_clip": 1.03483009, "balance_loss_mlp": 1.04278088, "epoch": 0.029460393807304974, "flos": 20119851573120.0, "grad_norm": 1.9148623633063457, "language_loss": 0.88380492, "learning_rate": 3.99150727054743e-06, "loss": 0.90658921, "num_input_tokens_seen": 10420150, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 1.625, "step": 490, "time_per_iteration": 5.217238187789917 }, { "auxiliary_loss_clip": 0.012128, "auxiliary_loss_mlp": 0.01068954, "balance_loss_clip": 1.03185582, "balance_loss_mlp": 1.04768872, "epoch": 0.029520517059972943, "flos": 17674953816960.0, "grad_norm": 3.2853586318233647, "language_loss": 0.91206759, "learning_rate": 3.9914724533978e-06, "loss": 0.93488508, "num_input_tokens_seen": 10438210, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.6484375, "step": 491, "time_per_iteration": 2.4247934818267822 }, { "auxiliary_loss_clip": 0.0120445, "auxiliary_loss_mlp": 0.01066569, "balance_loss_clip": 1.03044844, "balance_loss_mlp": 1.04490709, "epoch": 0.029580640312640915, "flos": 18952181418240.0, "grad_norm": 2.3581296560745586, "language_loss": 0.85065138, "learning_rate": 3.991437565177642e-06, "loss": 0.87336159, "num_input_tokens_seen": 10455125, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.59375, "step": 492, "time_per_iteration": 3.793816089630127 }, { "auxiliary_loss_clip": 0.0121162, "auxiliary_loss_mlp": 0.01078789, "balance_loss_clip": 1.03990269, "balance_loss_mlp": 1.047333, "epoch": 0.029640763565308884, "flos": 18725351114880.0, "grad_norm": 4.274363010344045, "language_loss": 0.83796686, "learning_rate": 3.991402605888198e-06, "loss": 0.8608709, "num_input_tokens_seen": 10470990, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 1.640625, "step": 493, "time_per_iteration": 2.3795204162597656 }, { "auxiliary_loss_clip": 0.01205968, "auxiliary_loss_mlp": 0.0106367, "balance_loss_clip": 1.02585661, "balance_loss_mlp": 1.04183817, "epoch": 0.029700886817976852, "flos": 20594251393920.0, "grad_norm": 1.7770761529375936, "language_loss": 0.86436814, "learning_rate": 3.991367575530719e-06, "loss": 0.88706452, "num_input_tokens_seen": 10490685, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.640625, "step": 494, "time_per_iteration": 2.4433693885803223 }, { "auxiliary_loss_clip": 0.01209465, "auxiliary_loss_mlp": 0.01067738, "balance_loss_clip": 1.03328609, "balance_loss_mlp": 1.04554904, "epoch": 0.02976101007064482, "flos": 22235762787840.0, "grad_norm": 2.355404434060518, "language_loss": 0.86683035, "learning_rate": 3.9913324741064535e-06, "loss": 0.88960236, "num_input_tokens_seen": 10509435, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.6328125, "step": 495, "time_per_iteration": 2.4427053928375244 }, { "auxiliary_loss_clip": 0.01204284, "auxiliary_loss_mlp": 0.01065663, "balance_loss_clip": 1.0279218, "balance_loss_mlp": 1.04706717, "epoch": 0.029821133323312793, "flos": 23731511788800.0, "grad_norm": 1.962976155669888, "language_loss": 0.61746514, "learning_rate": 3.991297301616653e-06, "loss": 0.64016461, "num_input_tokens_seen": 10530050, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 1.5703125, "step": 496, "time_per_iteration": 2.4484548568725586 }, { "auxiliary_loss_clip": 0.01204454, "auxiliary_loss_mlp": 0.01066235, "balance_loss_clip": 1.03109229, "balance_loss_mlp": 1.04732299, "epoch": 0.02988125657598076, "flos": 22418393443200.0, "grad_norm": 1.729398170481444, "language_loss": 0.8813799, "learning_rate": 3.991262058062575e-06, "loss": 0.90408683, "num_input_tokens_seen": 10551370, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.5703125, "step": 497, "time_per_iteration": 2.5011401176452637 }, { "auxiliary_loss_clip": 0.0120954, "auxiliary_loss_mlp": 0.01069592, "balance_loss_clip": 1.03247035, "balance_loss_mlp": 1.04552197, "epoch": 0.02994137982864873, "flos": 13844248531200.0, "grad_norm": 2.6030676199930602, "language_loss": 0.84617573, "learning_rate": 3.991226743445477e-06, "loss": 0.86896706, "num_input_tokens_seen": 10569225, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.640625, "step": 498, "time_per_iteration": 2.522249221801758 }, { "auxiliary_loss_clip": 0.01206957, "auxiliary_loss_mlp": 0.01070789, "balance_loss_clip": 1.03540766, "balance_loss_mlp": 1.04600763, "epoch": 0.0300015030813167, "flos": 23907404551680.0, "grad_norm": 6.119065223903574, "language_loss": 0.78805482, "learning_rate": 3.991191357766617e-06, "loss": 0.81083238, "num_input_tokens_seen": 10586170, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.609375, "step": 499, "time_per_iteration": 2.5096781253814697 }, { "auxiliary_loss_clip": 0.01208767, "auxiliary_loss_mlp": 0.01069197, "balance_loss_clip": 1.03391087, "balance_loss_mlp": 1.04836917, "epoch": 0.030061626333984667, "flos": 22015740199680.0, "grad_norm": 1.90305937173952, "language_loss": 0.82357585, "learning_rate": 3.991155901027261e-06, "loss": 0.84635556, "num_input_tokens_seen": 10606205, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.6015625, "step": 500, "time_per_iteration": 2.4628171920776367 }, { "auxiliary_loss_clip": 0.01200767, "auxiliary_loss_mlp": 0.01073662, "balance_loss_clip": 1.0358727, "balance_loss_mlp": 1.04500973, "epoch": 0.03012174958665264, "flos": 23038625479680.0, "grad_norm": 2.535327279683379, "language_loss": 0.8793115, "learning_rate": 3.991120373228672e-06, "loss": 0.90205586, "num_input_tokens_seen": 10625995, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.5546875, "step": 501, "time_per_iteration": 2.4772112369537354 }, { "auxiliary_loss_clip": 0.01207747, "auxiliary_loss_mlp": 0.01063189, "balance_loss_clip": 1.0281651, "balance_loss_mlp": 1.04432535, "epoch": 0.030181872839320608, "flos": 18952251240960.0, "grad_norm": 2.510197501519828, "language_loss": 0.86130059, "learning_rate": 3.991084774372118e-06, "loss": 0.88400996, "num_input_tokens_seen": 10644105, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.6328125, "step": 502, "time_per_iteration": 2.4320156574249268 }, { "auxiliary_loss_clip": 0.01205823, "auxiliary_loss_mlp": 0.01068997, "balance_loss_clip": 1.0321852, "balance_loss_mlp": 1.04956007, "epoch": 0.030241996091988577, "flos": 16727061870720.0, "grad_norm": 2.3152219203109867, "language_loss": 0.8469739, "learning_rate": 3.991049104458871e-06, "loss": 0.86972207, "num_input_tokens_seen": 10661090, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.5625, "step": 503, "time_per_iteration": 2.4763457775115967 }, { "auxiliary_loss_clip": 0.01199079, "auxiliary_loss_mlp": 0.01069118, "balance_loss_clip": 1.03326011, "balance_loss_mlp": 1.04358077, "epoch": 0.030302119344656545, "flos": 28620015580800.0, "grad_norm": 2.3787165165537334, "language_loss": 0.88057989, "learning_rate": 3.991013363490202e-06, "loss": 0.90326184, "num_input_tokens_seen": 10682380, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.5546875, "step": 504, "time_per_iteration": 2.498375654220581 }, { "auxiliary_loss_clip": 0.01201017, "auxiliary_loss_mlp": 0.01058491, "balance_loss_clip": 1.02403963, "balance_loss_mlp": 1.04350054, "epoch": 0.030362242597324514, "flos": 15668425491840.0, "grad_norm": 2.3046057205140835, "language_loss": 0.77504301, "learning_rate": 3.9909775514673885e-06, "loss": 0.79763812, "num_input_tokens_seen": 10699925, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.578125, "step": 505, "time_per_iteration": 2.5052103996276855 }, { "auxiliary_loss_clip": 0.01077902, "auxiliary_loss_mlp": 0.01015402, "balance_loss_clip": 1.0084877, "balance_loss_mlp": 1.01223004, "epoch": 0.030422365849992486, "flos": 72122107034880.0, "grad_norm": 0.841552000589698, "language_loss": 0.54989272, "learning_rate": 3.990941668391708e-06, "loss": 0.57082576, "num_input_tokens_seen": 10766525, "router_z_loss_clip": 0.06933594, "router_z_loss_mlp": 0.65625, "step": 506, "time_per_iteration": 3.3168389797210693 }, { "auxiliary_loss_clip": 0.01204627, "auxiliary_loss_mlp": 0.01071427, "balance_loss_clip": 1.03626084, "balance_loss_mlp": 1.04464555, "epoch": 0.030482489102660455, "flos": 19426790707200.0, "grad_norm": 2.1395024000496523, "language_loss": 0.83131456, "learning_rate": 3.99090571426444e-06, "loss": 0.85407519, "num_input_tokens_seen": 10786725, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.6015625, "step": 507, "time_per_iteration": 2.4338178634643555 }, { "auxiliary_loss_clip": 0.01205381, "auxiliary_loss_mlp": 0.01067995, "balance_loss_clip": 1.03156519, "balance_loss_mlp": 1.04393148, "epoch": 0.030542612355328423, "flos": 20374787387520.0, "grad_norm": 2.278497143714966, "language_loss": 0.87978184, "learning_rate": 3.990869689086868e-06, "loss": 0.90251565, "num_input_tokens_seen": 10805390, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 1.609375, "step": 508, "time_per_iteration": 2.524904489517212 }, { "auxiliary_loss_clip": 0.01206445, "auxiliary_loss_mlp": 0.01057358, "balance_loss_clip": 1.02202475, "balance_loss_mlp": 1.04496431, "epoch": 0.030602735607996392, "flos": 34675945148160.0, "grad_norm": 1.9490258310118795, "language_loss": 0.71126789, "learning_rate": 3.990833592860279e-06, "loss": 0.73390591, "num_input_tokens_seen": 10828030, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.6171875, "step": 509, "time_per_iteration": 2.56103777885437 }, { "auxiliary_loss_clip": 0.01200486, "auxiliary_loss_mlp": 0.01061738, "balance_loss_clip": 1.02790678, "balance_loss_mlp": 1.04403806, "epoch": 0.03066285886066436, "flos": 23657565795840.0, "grad_norm": 2.0781282707932016, "language_loss": 0.81924725, "learning_rate": 3.990797425585959e-06, "loss": 0.84186947, "num_input_tokens_seen": 10845240, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.5625, "step": 510, "time_per_iteration": 2.517728805541992 }, { "auxiliary_loss_clip": 0.01205482, "auxiliary_loss_mlp": 0.01062984, "balance_loss_clip": 1.02786517, "balance_loss_mlp": 1.04849553, "epoch": 0.030722982113332332, "flos": 23001861951360.0, "grad_norm": 2.2066502130873005, "language_loss": 0.83270842, "learning_rate": 3.9907611872652e-06, "loss": 0.85539317, "num_input_tokens_seen": 10864325, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.5703125, "step": 511, "time_per_iteration": 2.4717540740966797 }, { "auxiliary_loss_clip": 0.01203057, "auxiliary_loss_mlp": 0.01067064, "balance_loss_clip": 1.03380466, "balance_loss_mlp": 1.04298186, "epoch": 0.0307831053660003, "flos": 24749788769280.0, "grad_norm": 2.1679052234875598, "language_loss": 0.817909, "learning_rate": 3.990724877899296e-06, "loss": 0.84061021, "num_input_tokens_seen": 10883860, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.6015625, "step": 512, "time_per_iteration": 2.5127251148223877 }, { "auxiliary_loss_clip": 0.01199515, "auxiliary_loss_mlp": 0.01066431, "balance_loss_clip": 1.02985787, "balance_loss_mlp": 1.04325438, "epoch": 0.03084322861866827, "flos": 26139680928000.0, "grad_norm": 1.8918276302591983, "language_loss": 0.86687189, "learning_rate": 3.990688497489541e-06, "loss": 0.88953137, "num_input_tokens_seen": 10904555, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 1.5625, "step": 513, "time_per_iteration": 2.4929020404815674 }, { "auxiliary_loss_clip": 0.01207001, "auxiliary_loss_mlp": 0.0106846, "balance_loss_clip": 1.03360355, "balance_loss_mlp": 1.04672039, "epoch": 0.03090335187133624, "flos": 18770283901440.0, "grad_norm": 1.5700776694139793, "language_loss": 0.78757954, "learning_rate": 3.990652046037234e-06, "loss": 0.81033409, "num_input_tokens_seen": 10923700, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.609375, "step": 514, "time_per_iteration": 2.515777111053467 }, { "auxiliary_loss_clip": 0.01199241, "auxiliary_loss_mlp": 0.0106353, "balance_loss_clip": 1.0313673, "balance_loss_mlp": 1.04487944, "epoch": 0.030963475124004207, "flos": 23220767376000.0, "grad_norm": 3.2378638813729714, "language_loss": 0.76955855, "learning_rate": 3.990615523543677e-06, "loss": 0.79218626, "num_input_tokens_seen": 10942730, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.546875, "step": 515, "time_per_iteration": 2.475161075592041 }, { "auxiliary_loss_clip": 0.01198662, "auxiliary_loss_mlp": 0.01059837, "balance_loss_clip": 1.02440774, "balance_loss_mlp": 1.0403347, "epoch": 0.03102359837667218, "flos": 42523861536000.0, "grad_norm": 3.4578561136515913, "language_loss": 0.82421023, "learning_rate": 3.990578930010171e-06, "loss": 0.8467952, "num_input_tokens_seen": 10967120, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.5859375, "step": 516, "time_per_iteration": 2.6455483436584473 }, { "auxiliary_loss_clip": 0.01197751, "auxiliary_loss_mlp": 0.01061516, "balance_loss_clip": 1.0260396, "balance_loss_mlp": 1.04530215, "epoch": 0.031083721629340148, "flos": 21175939422720.0, "grad_norm": 1.725880235288346, "language_loss": 0.78557986, "learning_rate": 3.990542265438024e-06, "loss": 0.80817252, "num_input_tokens_seen": 10986775, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.5234375, "step": 517, "time_per_iteration": 2.499014139175415 }, { "auxiliary_loss_clip": 0.01194293, "auxiliary_loss_mlp": 0.01056726, "balance_loss_clip": 1.02375305, "balance_loss_mlp": 1.04368186, "epoch": 0.031143844882008116, "flos": 29714891817600.0, "grad_norm": 1.5230128811737134, "language_loss": 0.9046182, "learning_rate": 3.990505529828544e-06, "loss": 0.92712843, "num_input_tokens_seen": 11011360, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.5078125, "step": 518, "time_per_iteration": 2.533905029296875 }, { "auxiliary_loss_clip": 0.01208601, "auxiliary_loss_mlp": 0.01072366, "balance_loss_clip": 1.03586388, "balance_loss_mlp": 1.04859257, "epoch": 0.031203968134676085, "flos": 23111349575040.0, "grad_norm": 3.0270756451628125, "language_loss": 0.86141729, "learning_rate": 3.9904687231830424e-06, "loss": 0.88422704, "num_input_tokens_seen": 11030150, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 1.6015625, "step": 519, "time_per_iteration": 2.497805118560791 }, { "auxiliary_loss_clip": 0.01199843, "auxiliary_loss_mlp": 0.01066668, "balance_loss_clip": 1.03123891, "balance_loss_mlp": 1.04227221, "epoch": 0.03126409138734405, "flos": 20953473039360.0, "grad_norm": 2.5245551099703847, "language_loss": 0.86705911, "learning_rate": 3.990431845502831e-06, "loss": 0.88972425, "num_input_tokens_seen": 11049145, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.578125, "step": 520, "time_per_iteration": 2.4443423748016357 }, { "auxiliary_loss_clip": 0.01202682, "auxiliary_loss_mlp": 0.01074112, "balance_loss_clip": 1.03861165, "balance_loss_mlp": 1.04245722, "epoch": 0.031324214640012026, "flos": 21649117345920.0, "grad_norm": 1.7543559854752624, "language_loss": 0.89257371, "learning_rate": 3.990394896789228e-06, "loss": 0.91534168, "num_input_tokens_seen": 11068835, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.6015625, "step": 521, "time_per_iteration": 2.474047899246216 }, { "auxiliary_loss_clip": 0.01196938, "auxiliary_loss_mlp": 0.01070582, "balance_loss_clip": 1.03491497, "balance_loss_mlp": 1.04307437, "epoch": 0.03138433789267999, "flos": 23440196471040.0, "grad_norm": 2.037463426747011, "language_loss": 0.70534217, "learning_rate": 3.9903578770435505e-06, "loss": 0.72801739, "num_input_tokens_seen": 11088980, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 1.5390625, "step": 522, "time_per_iteration": 2.466909170150757 }, { "auxiliary_loss_clip": 0.01202988, "auxiliary_loss_mlp": 0.0106513, "balance_loss_clip": 1.03029704, "balance_loss_mlp": 1.04272258, "epoch": 0.03144446114534796, "flos": 18981369181440.0, "grad_norm": 4.602702188577186, "language_loss": 0.84968263, "learning_rate": 3.99032078626712e-06, "loss": 0.87236381, "num_input_tokens_seen": 11104300, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.6015625, "step": 523, "time_per_iteration": 2.43253231048584 }, { "auxiliary_loss_clip": 0.01203881, "auxiliary_loss_mlp": 0.01065043, "balance_loss_clip": 1.03044844, "balance_loss_mlp": 1.04492521, "epoch": 0.031504584398015935, "flos": 22636600640640.0, "grad_norm": 2.931477636015078, "language_loss": 0.89870876, "learning_rate": 3.990283624461261e-06, "loss": 0.92139804, "num_input_tokens_seen": 11123335, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 1.59375, "step": 524, "time_per_iteration": 2.4421632289886475 }, { "auxiliary_loss_clip": 0.01205348, "auxiliary_loss_mlp": 0.01072081, "balance_loss_clip": 1.03605592, "balance_loss_mlp": 1.04450417, "epoch": 0.0315647076506839, "flos": 25296004990080.0, "grad_norm": 3.1770234004138236, "language_loss": 0.79840553, "learning_rate": 3.9902463916273e-06, "loss": 0.82117987, "num_input_tokens_seen": 11140880, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.609375, "step": 525, "time_per_iteration": 2.512315034866333 }, { "auxiliary_loss_clip": 0.01197829, "auxiliary_loss_mlp": 0.01059955, "balance_loss_clip": 1.02438259, "balance_loss_mlp": 1.04082167, "epoch": 0.03162483090335187, "flos": 16981892951040.0, "grad_norm": 1.974038699837512, "language_loss": 0.80296195, "learning_rate": 3.990209087766563e-06, "loss": 0.82553983, "num_input_tokens_seen": 11158710, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.5703125, "step": 526, "time_per_iteration": 2.4241766929626465 }, { "auxiliary_loss_clip": 0.01206056, "auxiliary_loss_mlp": 0.01059794, "balance_loss_clip": 1.02348268, "balance_loss_mlp": 1.04648757, "epoch": 0.03168495415601984, "flos": 18733485461760.0, "grad_norm": 2.082035333094545, "language_loss": 0.81417549, "learning_rate": 3.990171712880383e-06, "loss": 0.83683401, "num_input_tokens_seen": 11177550, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.59375, "step": 527, "time_per_iteration": 2.393235921859741 }, { "auxiliary_loss_clip": 0.01193488, "auxiliary_loss_mlp": 0.01064555, "balance_loss_clip": 1.03058052, "balance_loss_mlp": 1.03917575, "epoch": 0.03174507740868781, "flos": 21213820114560.0, "grad_norm": 1.9535105862033473, "language_loss": 0.93562591, "learning_rate": 3.990134266970095e-06, "loss": 0.95820642, "num_input_tokens_seen": 11196230, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.546875, "step": 528, "time_per_iteration": 3.870699644088745 }, { "auxiliary_loss_clip": 0.01199748, "auxiliary_loss_mlp": 0.01055606, "balance_loss_clip": 1.02167869, "balance_loss_mlp": 1.04302394, "epoch": 0.03180520066135578, "flos": 24786587208960.0, "grad_norm": 1.9424774422442086, "language_loss": 0.83959383, "learning_rate": 3.9900967500370335e-06, "loss": 0.86214739, "num_input_tokens_seen": 11214935, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.5625, "step": 529, "time_per_iteration": 2.437253952026367 }, { "auxiliary_loss_clip": 0.01198756, "auxiliary_loss_mlp": 0.01064429, "balance_loss_clip": 1.03158712, "balance_loss_mlp": 1.04470348, "epoch": 0.03186532391402375, "flos": 24863081731200.0, "grad_norm": 2.219900736369779, "language_loss": 0.90270782, "learning_rate": 3.990059162082539e-06, "loss": 0.9253397, "num_input_tokens_seen": 11235310, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.5390625, "step": 530, "time_per_iteration": 5.275681972503662 }, { "auxiliary_loss_clip": 0.01192801, "auxiliary_loss_mlp": 0.01061732, "balance_loss_clip": 1.02620769, "balance_loss_mlp": 1.03718972, "epoch": 0.03192544716669172, "flos": 21213994671360.0, "grad_norm": 2.2176436304325775, "language_loss": 0.76117861, "learning_rate": 3.9900215031079515e-06, "loss": 0.78372395, "num_input_tokens_seen": 11254425, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.5546875, "step": 531, "time_per_iteration": 2.421377420425415 }, { "auxiliary_loss_clip": 0.0119355, "auxiliary_loss_mlp": 0.01057381, "balance_loss_clip": 1.02307296, "balance_loss_mlp": 1.04110742, "epoch": 0.03198557041935969, "flos": 24352058027520.0, "grad_norm": 2.0795541844250462, "language_loss": 0.904203, "learning_rate": 3.989983773114616e-06, "loss": 0.92671233, "num_input_tokens_seen": 11274595, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.5234375, "step": 532, "time_per_iteration": 3.8595516681671143 }, { "auxiliary_loss_clip": 0.01078112, "auxiliary_loss_mlp": 0.0100629, "balance_loss_clip": 0.99875575, "balance_loss_mlp": 1.01723647, "epoch": 0.032045693672027656, "flos": 61824056186880.0, "grad_norm": 0.7257353694709775, "language_loss": 0.5794214, "learning_rate": 3.989945972103877e-06, "loss": 0.60026538, "num_input_tokens_seen": 11336705, "router_z_loss_clip": 0.07519531, "router_z_loss_mlp": 0.609375, "step": 533, "time_per_iteration": 3.104865550994873 }, { "auxiliary_loss_clip": 0.01193289, "auxiliary_loss_mlp": 0.01067067, "balance_loss_clip": 1.03375959, "balance_loss_mlp": 1.04044938, "epoch": 0.03210581692469563, "flos": 28399958081280.0, "grad_norm": 1.6510738722888407, "language_loss": 0.8620894, "learning_rate": 3.989908100077087e-06, "loss": 0.88469297, "num_input_tokens_seen": 11356820, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.5234375, "step": 534, "time_per_iteration": 2.4982473850250244 }, { "auxiliary_loss_clip": 0.01195723, "auxiliary_loss_mlp": 0.01056269, "balance_loss_clip": 1.02060103, "balance_loss_mlp": 1.04258776, "epoch": 0.03216594017736359, "flos": 24716551288320.0, "grad_norm": 2.2136679875308443, "language_loss": 0.7724539, "learning_rate": 3.989870157035594e-06, "loss": 0.79497379, "num_input_tokens_seen": 11376645, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.53125, "step": 535, "time_per_iteration": 2.4496872425079346 }, { "auxiliary_loss_clip": 0.01195513, "auxiliary_loss_mlp": 0.0106181, "balance_loss_clip": 1.02802587, "balance_loss_mlp": 1.04010534, "epoch": 0.032226063430031565, "flos": 31174121669760.0, "grad_norm": 2.1996955541292476, "language_loss": 0.80698258, "learning_rate": 3.989832142980754e-06, "loss": 0.82955575, "num_input_tokens_seen": 11397310, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.5546875, "step": 536, "time_per_iteration": 2.5376601219177246 }, { "auxiliary_loss_clip": 0.01194075, "auxiliary_loss_mlp": 0.01062913, "balance_loss_clip": 1.02779412, "balance_loss_mlp": 1.04131877, "epoch": 0.03228618668269954, "flos": 32196832392960.0, "grad_norm": 1.9626146690772939, "language_loss": 0.69564807, "learning_rate": 3.989794057913923e-06, "loss": 0.71821791, "num_input_tokens_seen": 11418475, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.53125, "step": 537, "time_per_iteration": 2.4927241802215576 }, { "auxiliary_loss_clip": 0.01197973, "auxiliary_loss_mlp": 0.0106739, "balance_loss_clip": 1.03355789, "balance_loss_mlp": 1.04507327, "epoch": 0.0323463099353675, "flos": 22669174805760.0, "grad_norm": 2.1653692978086414, "language_loss": 0.82236588, "learning_rate": 3.9897559018364615e-06, "loss": 0.84501946, "num_input_tokens_seen": 11436630, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.53125, "step": 538, "time_per_iteration": 2.4447684288024902 }, { "auxiliary_loss_clip": 0.01201374, "auxiliary_loss_mlp": 0.01062495, "balance_loss_clip": 1.02725673, "balance_loss_mlp": 1.04155743, "epoch": 0.032406433188035474, "flos": 26903999612160.0, "grad_norm": 1.7500385242252072, "language_loss": 0.79262614, "learning_rate": 3.98971767474973e-06, "loss": 0.81526482, "num_input_tokens_seen": 11457275, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.6015625, "step": 539, "time_per_iteration": 2.4402785301208496 }, { "auxiliary_loss_clip": 0.0119521, "auxiliary_loss_mlp": 0.01064609, "balance_loss_clip": 1.02927542, "balance_loss_mlp": 1.04309082, "epoch": 0.03246655644070344, "flos": 31502584540800.0, "grad_norm": 3.1364036052751243, "language_loss": 0.77135301, "learning_rate": 3.989679376655092e-06, "loss": 0.79395115, "num_input_tokens_seen": 11476925, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.5234375, "step": 540, "time_per_iteration": 2.5234434604644775 }, { "auxiliary_loss_clip": 0.01202534, "auxiliary_loss_mlp": 0.01065013, "balance_loss_clip": 1.02903533, "balance_loss_mlp": 1.04551792, "epoch": 0.03252667969337141, "flos": 23217311151360.0, "grad_norm": 2.7523998804954344, "language_loss": 0.85123587, "learning_rate": 3.989641007553916e-06, "loss": 0.87391126, "num_input_tokens_seen": 11496830, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.5703125, "step": 541, "time_per_iteration": 2.4256014823913574 }, { "auxiliary_loss_clip": 0.01195641, "auxiliary_loss_mlp": 0.01061551, "balance_loss_clip": 1.02438188, "balance_loss_mlp": 1.04368758, "epoch": 0.032586802946039384, "flos": 14756563935360.0, "grad_norm": 2.2643202941631486, "language_loss": 0.88175774, "learning_rate": 3.989602567447569e-06, "loss": 0.90432966, "num_input_tokens_seen": 11515605, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.515625, "step": 542, "time_per_iteration": 2.4088408946990967 }, { "auxiliary_loss_clip": 0.01196528, "auxiliary_loss_mlp": 0.01065616, "balance_loss_clip": 1.0302583, "balance_loss_mlp": 1.04263008, "epoch": 0.03264692619870735, "flos": 24279508488960.0, "grad_norm": 1.975439138667125, "language_loss": 0.70890611, "learning_rate": 3.989564056337426e-06, "loss": 0.73152757, "num_input_tokens_seen": 11536230, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.5390625, "step": 543, "time_per_iteration": 2.4294607639312744 }, { "auxiliary_loss_clip": 0.0119393, "auxiliary_loss_mlp": 0.01060699, "balance_loss_clip": 1.0262711, "balance_loss_mlp": 1.03910398, "epoch": 0.03270704945137532, "flos": 22892060125440.0, "grad_norm": 2.6781412261206756, "language_loss": 0.91309845, "learning_rate": 3.989525474224858e-06, "loss": 0.93564469, "num_input_tokens_seen": 11554715, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.546875, "step": 544, "time_per_iteration": 2.438074827194214 }, { "auxiliary_loss_clip": 0.01197985, "auxiliary_loss_mlp": 0.01053669, "balance_loss_clip": 1.02126789, "balance_loss_mlp": 1.04320121, "epoch": 0.032767172704043286, "flos": 18040040570880.0, "grad_norm": 3.2598794138189326, "language_loss": 0.65982533, "learning_rate": 3.989486821111244e-06, "loss": 0.68234193, "num_input_tokens_seen": 11571370, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.546875, "step": 545, "time_per_iteration": 2.3907058238983154 }, { "auxiliary_loss_clip": 0.01197121, "auxiliary_loss_mlp": 0.01058319, "balance_loss_clip": 1.02453566, "balance_loss_mlp": 1.04238844, "epoch": 0.03282729595671126, "flos": 22527636687360.0, "grad_norm": 2.2970789725101652, "language_loss": 0.91792428, "learning_rate": 3.9894480969979635e-06, "loss": 0.94047862, "num_input_tokens_seen": 11588560, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.546875, "step": 546, "time_per_iteration": 2.4523770809173584 }, { "auxiliary_loss_clip": 0.01193368, "auxiliary_loss_mlp": 0.01060539, "balance_loss_clip": 1.02456117, "balance_loss_mlp": 1.03936839, "epoch": 0.03288741920937923, "flos": 20409630791040.0, "grad_norm": 3.55375524184824, "language_loss": 0.81702125, "learning_rate": 3.989409301886398e-06, "loss": 0.83956033, "num_input_tokens_seen": 11605685, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.5390625, "step": 547, "time_per_iteration": 2.4157586097717285 }, { "auxiliary_loss_clip": 0.01195076, "auxiliary_loss_mlp": 0.01056152, "balance_loss_clip": 1.02089, "balance_loss_mlp": 1.0420146, "epoch": 0.032947542462047195, "flos": 20776916960640.0, "grad_norm": 1.8755243921553955, "language_loss": 0.80964327, "learning_rate": 3.989370435777931e-06, "loss": 0.83215559, "num_input_tokens_seen": 11626290, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.53125, "step": 548, "time_per_iteration": 2.4845027923583984 }, { "auxiliary_loss_clip": 0.01196537, "auxiliary_loss_mlp": 0.01059157, "balance_loss_clip": 1.02194011, "balance_loss_mlp": 1.04265714, "epoch": 0.03300766571471517, "flos": 19900247921280.0, "grad_norm": 4.870889560058078, "language_loss": 0.67086864, "learning_rate": 3.989331498673951e-06, "loss": 0.69342566, "num_input_tokens_seen": 11643950, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 1.5390625, "step": 549, "time_per_iteration": 2.459944009780884 }, { "auxiliary_loss_clip": 0.01191409, "auxiliary_loss_mlp": 0.01063525, "balance_loss_clip": 1.02819109, "balance_loss_mlp": 1.04134452, "epoch": 0.03306778896738313, "flos": 17966792805120.0, "grad_norm": 2.04638296747371, "language_loss": 0.85720515, "learning_rate": 3.9892924905758475e-06, "loss": 0.87975454, "num_input_tokens_seen": 11662560, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.5, "step": 550, "time_per_iteration": 2.4711291790008545 }, { "auxiliary_loss_clip": 0.01194971, "auxiliary_loss_mlp": 0.01069821, "balance_loss_clip": 1.03541744, "balance_loss_mlp": 1.04545546, "epoch": 0.033127912220051105, "flos": 21652294279680.0, "grad_norm": 1.7071244062597453, "language_loss": 0.81147861, "learning_rate": 3.989253411485011e-06, "loss": 0.83412647, "num_input_tokens_seen": 11682265, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.5, "step": 551, "time_per_iteration": 2.4221384525299072 }, { "auxiliary_loss_clip": 0.01197454, "auxiliary_loss_mlp": 0.01065646, "balance_loss_clip": 1.03043151, "balance_loss_mlp": 1.04241097, "epoch": 0.03318803547271908, "flos": 30187127134080.0, "grad_norm": 2.8232966739904155, "language_loss": 0.86134279, "learning_rate": 3.989214261402838e-06, "loss": 0.88397378, "num_input_tokens_seen": 11699300, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.5546875, "step": 552, "time_per_iteration": 2.518507719039917 }, { "auxiliary_loss_clip": 0.01197266, "auxiliary_loss_mlp": 0.01063903, "balance_loss_clip": 1.02642334, "balance_loss_mlp": 1.04250526, "epoch": 0.03324815872538704, "flos": 20374996855680.0, "grad_norm": 2.3913134793234097, "language_loss": 0.92376202, "learning_rate": 3.989175040330724e-06, "loss": 0.9463737, "num_input_tokens_seen": 11716955, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 1.546875, "step": 553, "time_per_iteration": 2.4046833515167236 }, { "auxiliary_loss_clip": 0.01197331, "auxiliary_loss_mlp": 0.01065382, "balance_loss_clip": 1.02694941, "balance_loss_mlp": 1.04425418, "epoch": 0.033308281978055014, "flos": 24493526323200.0, "grad_norm": 2.3784682061349676, "language_loss": 0.78795719, "learning_rate": 3.98913574827007e-06, "loss": 0.81058431, "num_input_tokens_seen": 11736130, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 1.53125, "step": 554, "time_per_iteration": 2.4820733070373535 }, { "auxiliary_loss_clip": 0.0119128, "auxiliary_loss_mlp": 0.01069872, "balance_loss_clip": 1.03301299, "balance_loss_mlp": 1.04178536, "epoch": 0.03336840523072298, "flos": 23399313402240.0, "grad_norm": 2.431197399805768, "language_loss": 0.81781608, "learning_rate": 3.989096385222278e-06, "loss": 0.84042764, "num_input_tokens_seen": 11754425, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.4921875, "step": 555, "time_per_iteration": 2.4461193084716797 }, { "auxiliary_loss_clip": 0.01195652, "auxiliary_loss_mlp": 0.01064587, "balance_loss_clip": 1.02787089, "balance_loss_mlp": 1.04220772, "epoch": 0.03342852848339095, "flos": 30549386067840.0, "grad_norm": 2.9516103816303603, "language_loss": 0.88176799, "learning_rate": 3.989056951188753e-06, "loss": 0.90437037, "num_input_tokens_seen": 11772845, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.53125, "step": 556, "time_per_iteration": 2.528106689453125 }, { "auxiliary_loss_clip": 0.01194919, "auxiliary_loss_mlp": 0.01068536, "balance_loss_clip": 1.03270173, "balance_loss_mlp": 1.04206967, "epoch": 0.03348865173605892, "flos": 22892199770880.0, "grad_norm": 1.9707480879266779, "language_loss": 0.83577824, "learning_rate": 3.989017446170901e-06, "loss": 0.85841274, "num_input_tokens_seen": 11792850, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.53125, "step": 557, "time_per_iteration": 2.4847426414489746 }, { "auxiliary_loss_clip": 0.0119705, "auxiliary_loss_mlp": 0.0105911, "balance_loss_clip": 1.02418184, "balance_loss_mlp": 1.04479933, "epoch": 0.03354877498872689, "flos": 17675058551040.0, "grad_norm": 3.8172882719549515, "language_loss": 0.93698788, "learning_rate": 3.988977870170133e-06, "loss": 0.95954949, "num_input_tokens_seen": 11809670, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 1.5234375, "step": 558, "time_per_iteration": 2.411505699157715 }, { "auxiliary_loss_clip": 0.01193043, "auxiliary_loss_mlp": 0.01063056, "balance_loss_clip": 1.02886689, "balance_loss_mlp": 1.04174602, "epoch": 0.03360889824139486, "flos": 21651910254720.0, "grad_norm": 6.270277986351199, "language_loss": 0.76974529, "learning_rate": 3.988938223187861e-06, "loss": 0.7923063, "num_input_tokens_seen": 11829665, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.515625, "step": 559, "time_per_iteration": 2.419795036315918 }, { "auxiliary_loss_clip": 0.01193975, "auxiliary_loss_mlp": 0.01067361, "balance_loss_clip": 1.03369641, "balance_loss_mlp": 1.04131722, "epoch": 0.033669021494062826, "flos": 21794740093440.0, "grad_norm": 2.815050991405286, "language_loss": 0.87094873, "learning_rate": 3.9888985052255005e-06, "loss": 0.89356208, "num_input_tokens_seen": 11848190, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.53125, "step": 560, "time_per_iteration": 2.4539504051208496 }, { "auxiliary_loss_clip": 0.01190911, "auxiliary_loss_mlp": 0.01056281, "balance_loss_clip": 1.02352214, "balance_loss_mlp": 1.04093623, "epoch": 0.0337291447467308, "flos": 21865299684480.0, "grad_norm": 3.150492713638745, "language_loss": 0.80860865, "learning_rate": 3.988858716284468e-06, "loss": 0.83108056, "num_input_tokens_seen": 11864795, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.5, "step": 561, "time_per_iteration": 2.392932891845703 }, { "auxiliary_loss_clip": 0.01192948, "auxiliary_loss_mlp": 0.01064501, "balance_loss_clip": 1.0318135, "balance_loss_mlp": 1.04135442, "epoch": 0.03378926799939877, "flos": 24244734908160.0, "grad_norm": 1.7802178247380682, "language_loss": 0.81872559, "learning_rate": 3.988818856366184e-06, "loss": 0.84130007, "num_input_tokens_seen": 11885275, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.515625, "step": 562, "time_per_iteration": 2.4758028984069824 }, { "auxiliary_loss_clip": 0.01197868, "auxiliary_loss_mlp": 0.01073311, "balance_loss_clip": 1.03945541, "balance_loss_mlp": 1.0440259, "epoch": 0.033849391252066735, "flos": 16506899637120.0, "grad_norm": 1.9287515962020005, "language_loss": 0.83921456, "learning_rate": 3.9887789254720704e-06, "loss": 0.86192638, "num_input_tokens_seen": 11903595, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.5390625, "step": 563, "time_per_iteration": 2.3996310234069824 }, { "auxiliary_loss_clip": 0.01195139, "auxiliary_loss_mlp": 0.01065566, "balance_loss_clip": 1.02963662, "balance_loss_mlp": 1.04248786, "epoch": 0.03390951450473471, "flos": 15668390580480.0, "grad_norm": 2.33407814831408, "language_loss": 0.93336153, "learning_rate": 3.988738923603553e-06, "loss": 0.95596856, "num_input_tokens_seen": 11917815, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.5234375, "step": 564, "time_per_iteration": 2.3867733478546143 }, { "auxiliary_loss_clip": 0.01194909, "auxiliary_loss_mlp": 0.01066091, "balance_loss_clip": 1.03147316, "balance_loss_mlp": 1.03992391, "epoch": 0.03396963775740267, "flos": 22673678371200.0, "grad_norm": 3.090948314160313, "language_loss": 0.94309002, "learning_rate": 3.98869885076206e-06, "loss": 0.96570009, "num_input_tokens_seen": 11936305, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.546875, "step": 565, "time_per_iteration": 2.473637104034424 }, { "auxiliary_loss_clip": 0.01080434, "auxiliary_loss_mlp": 0.01023652, "balance_loss_clip": 1.01535463, "balance_loss_mlp": 1.01858997, "epoch": 0.034029761010070644, "flos": 64388984797440.0, "grad_norm": 1.148770634874019, "language_loss": 0.54870236, "learning_rate": 3.9886587069490195e-06, "loss": 0.56974322, "num_input_tokens_seen": 11998940, "router_z_loss_clip": 0.08300781, "router_z_loss_mlp": 0.6171875, "step": 566, "time_per_iteration": 3.11896014213562 }, { "auxiliary_loss_clip": 0.01195853, "auxiliary_loss_mlp": 0.01062324, "balance_loss_clip": 1.0245589, "balance_loss_mlp": 1.04374218, "epoch": 0.034089884262738616, "flos": 25003188483840.0, "grad_norm": 2.358639810820114, "language_loss": 0.76279438, "learning_rate": 3.988618492165865e-06, "loss": 0.78537619, "num_input_tokens_seen": 12018860, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.515625, "step": 567, "time_per_iteration": 3.895256280899048 }, { "auxiliary_loss_clip": 0.01189289, "auxiliary_loss_mlp": 0.01067985, "balance_loss_clip": 1.03274679, "balance_loss_mlp": 1.04122317, "epoch": 0.03415000751540658, "flos": 28437838773120.0, "grad_norm": 2.024071973675408, "language_loss": 0.80621415, "learning_rate": 3.988578206414032e-06, "loss": 0.82878685, "num_input_tokens_seen": 12039675, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.484375, "step": 568, "time_per_iteration": 2.4694976806640625 }, { "auxiliary_loss_clip": 0.01193456, "auxiliary_loss_mlp": 0.01061731, "balance_loss_clip": 1.02859068, "balance_loss_mlp": 1.0454756, "epoch": 0.034210130768074554, "flos": 21467708588160.0, "grad_norm": 1.9569454038555405, "language_loss": 0.8628267, "learning_rate": 3.988537849694959e-06, "loss": 0.8853786, "num_input_tokens_seen": 12057680, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.484375, "step": 569, "time_per_iteration": 3.8459970951080322 }, { "auxiliary_loss_clip": 0.0119573, "auxiliary_loss_mlp": 0.0106184, "balance_loss_clip": 1.02598178, "balance_loss_mlp": 1.04251552, "epoch": 0.03427025402074252, "flos": 18696512465280.0, "grad_norm": 1.8131955516308138, "language_loss": 0.95423174, "learning_rate": 3.988497422010084e-06, "loss": 0.97680748, "num_input_tokens_seen": 12076135, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.53125, "step": 570, "time_per_iteration": 3.7499139308929443 }, { "auxiliary_loss_clip": 0.01191599, "auxiliary_loss_mlp": 0.01061289, "balance_loss_clip": 1.02357149, "balance_loss_mlp": 1.03753138, "epoch": 0.03433037727341049, "flos": 20848942828800.0, "grad_norm": 2.495821873687206, "language_loss": 0.79018605, "learning_rate": 3.988456923360852e-06, "loss": 0.81271493, "num_input_tokens_seen": 12094785, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 1.5390625, "step": 571, "time_per_iteration": 3.85359787940979 }, { "auxiliary_loss_clip": 0.01195057, "auxiliary_loss_mlp": 0.01068013, "balance_loss_clip": 1.03172517, "balance_loss_mlp": 1.04233098, "epoch": 0.03439050052607846, "flos": 25409123395200.0, "grad_norm": 2.40142219818747, "language_loss": 0.80008596, "learning_rate": 3.988416353748707e-06, "loss": 0.82271665, "num_input_tokens_seen": 12114590, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.53125, "step": 572, "time_per_iteration": 2.4332334995269775 }, { "auxiliary_loss_clip": 0.01198195, "auxiliary_loss_mlp": 0.01057882, "balance_loss_clip": 1.02295363, "balance_loss_mlp": 1.04672611, "epoch": 0.03445062377874643, "flos": 17639167806720.0, "grad_norm": 2.9297505078403385, "language_loss": 0.84247696, "learning_rate": 3.988375713175097e-06, "loss": 0.86503768, "num_input_tokens_seen": 12132390, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 1.515625, "step": 573, "time_per_iteration": 2.392399787902832 }, { "auxiliary_loss_clip": 0.01193016, "auxiliary_loss_mlp": 0.01064463, "balance_loss_clip": 1.0300827, "balance_loss_mlp": 1.04029369, "epoch": 0.0345107470314144, "flos": 16763546108160.0, "grad_norm": 2.3845255873691547, "language_loss": 0.76166523, "learning_rate": 3.988335001641473e-06, "loss": 0.78423995, "num_input_tokens_seen": 12149035, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.53125, "step": 574, "time_per_iteration": 2.3899247646331787 }, { "auxiliary_loss_clip": 0.01194761, "auxiliary_loss_mlp": 0.01054424, "balance_loss_clip": 1.02185607, "balance_loss_mlp": 1.04313254, "epoch": 0.03457087028408237, "flos": 14683560549120.0, "grad_norm": 2.8759034033766717, "language_loss": 0.83530688, "learning_rate": 3.988294219149287e-06, "loss": 0.8577987, "num_input_tokens_seen": 12167530, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.515625, "step": 575, "time_per_iteration": 2.404356002807617 }, { "auxiliary_loss_clip": 0.01191842, "auxiliary_loss_mlp": 0.01066171, "balance_loss_clip": 1.03207731, "balance_loss_mlp": 1.04431152, "epoch": 0.03463099353675034, "flos": 20010259215360.0, "grad_norm": 2.1975776036879133, "language_loss": 0.83930761, "learning_rate": 3.9882533656999945e-06, "loss": 0.86188769, "num_input_tokens_seen": 12186340, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.4765625, "step": 576, "time_per_iteration": 2.404324769973755 }, { "auxiliary_loss_clip": 0.01192762, "auxiliary_loss_mlp": 0.01065608, "balance_loss_clip": 1.03306341, "balance_loss_mlp": 1.04672194, "epoch": 0.03469111678941831, "flos": 25299984885120.0, "grad_norm": 2.136277743417185, "language_loss": 0.86451602, "learning_rate": 3.988212441295054e-06, "loss": 0.88709974, "num_input_tokens_seen": 12204090, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.4609375, "step": 577, "time_per_iteration": 2.4424991607666016 }, { "auxiliary_loss_clip": 0.01193216, "auxiliary_loss_mlp": 0.0106795, "balance_loss_clip": 1.03554869, "balance_loss_mlp": 1.04519367, "epoch": 0.034751240042086275, "flos": 23258264042880.0, "grad_norm": 2.0062231336852197, "language_loss": 0.72245854, "learning_rate": 3.9881714459359255e-06, "loss": 0.74507022, "num_input_tokens_seen": 12224850, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.484375, "step": 578, "time_per_iteration": 2.41383695602417 }, { "auxiliary_loss_clip": 0.01192622, "auxiliary_loss_mlp": 0.010612, "balance_loss_clip": 1.02734423, "balance_loss_mlp": 1.04283249, "epoch": 0.03481136329475425, "flos": 23768100760320.0, "grad_norm": 1.9369968368063495, "language_loss": 0.77471632, "learning_rate": 3.988130379624073e-06, "loss": 0.7972545, "num_input_tokens_seen": 12244935, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.5, "step": 579, "time_per_iteration": 2.420255422592163 }, { "auxiliary_loss_clip": 0.01192129, "auxiliary_loss_mlp": 0.01062744, "balance_loss_clip": 1.02752972, "balance_loss_mlp": 1.04205263, "epoch": 0.03487148654742222, "flos": 20156475456000.0, "grad_norm": 2.55101409008302, "language_loss": 0.86368865, "learning_rate": 3.988089242360961e-06, "loss": 0.88623732, "num_input_tokens_seen": 12262140, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.5, "step": 580, "time_per_iteration": 2.4236578941345215 }, { "auxiliary_loss_clip": 0.01196551, "auxiliary_loss_mlp": 0.01058375, "balance_loss_clip": 1.02592683, "balance_loss_mlp": 1.04238605, "epoch": 0.034931609800090184, "flos": 15668669871360.0, "grad_norm": 2.332495529883519, "language_loss": 0.82363093, "learning_rate": 3.988048034148057e-06, "loss": 0.8461802, "num_input_tokens_seen": 12280930, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.5390625, "step": 581, "time_per_iteration": 2.393260955810547 }, { "auxiliary_loss_clip": 0.0119314, "auxiliary_loss_mlp": 0.01060886, "balance_loss_clip": 1.02884281, "balance_loss_mlp": 1.04386783, "epoch": 0.034991733052758156, "flos": 16361451446400.0, "grad_norm": 2.557332275981054, "language_loss": 0.76911843, "learning_rate": 3.988006754986834e-06, "loss": 0.7916587, "num_input_tokens_seen": 12299125, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.4921875, "step": 582, "time_per_iteration": 2.378066062927246 }, { "auxiliary_loss_clip": 0.01193353, "auxiliary_loss_mlp": 0.01065852, "balance_loss_clip": 1.03054273, "balance_loss_mlp": 1.04665709, "epoch": 0.03505185630542612, "flos": 19386396397440.0, "grad_norm": 2.182466588591563, "language_loss": 0.87432832, "learning_rate": 3.987965404878763e-06, "loss": 0.89692038, "num_input_tokens_seen": 12316905, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.46875, "step": 583, "time_per_iteration": 2.4018166065216064 }, { "auxiliary_loss_clip": 0.0119538, "auxiliary_loss_mlp": 0.01065533, "balance_loss_clip": 1.03060496, "balance_loss_mlp": 1.0420208, "epoch": 0.03511197955809409, "flos": 21322784067840.0, "grad_norm": 2.4167703716027362, "language_loss": 0.80618572, "learning_rate": 3.987923983825321e-06, "loss": 0.82879484, "num_input_tokens_seen": 12335070, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 1.53125, "step": 584, "time_per_iteration": 2.3943240642547607 }, { "auxiliary_loss_clip": 0.01192112, "auxiliary_loss_mlp": 0.01060208, "balance_loss_clip": 1.02647233, "balance_loss_mlp": 1.04246902, "epoch": 0.035172102810762065, "flos": 14135738405760.0, "grad_norm": 5.747430825665412, "language_loss": 0.92533493, "learning_rate": 3.9878824918279845e-06, "loss": 0.9478581, "num_input_tokens_seen": 12350315, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.5, "step": 585, "time_per_iteration": 2.3824782371520996 }, { "auxiliary_loss_clip": 0.01193938, "auxiliary_loss_mlp": 0.01061532, "balance_loss_clip": 1.02808166, "balance_loss_mlp": 1.04445136, "epoch": 0.03523222606343003, "flos": 20296023626880.0, "grad_norm": 2.282708667245842, "language_loss": 0.87457907, "learning_rate": 3.9878409288882364e-06, "loss": 0.89713371, "num_input_tokens_seen": 12366030, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 1.5, "step": 586, "time_per_iteration": 2.3860182762145996 }, { "auxiliary_loss_clip": 0.01198949, "auxiliary_loss_mlp": 0.01060182, "balance_loss_clip": 1.02730393, "balance_loss_mlp": 1.04620695, "epoch": 0.035292349316098, "flos": 20374787387520.0, "grad_norm": 1.9012974248253003, "language_loss": 0.76167411, "learning_rate": 3.987799295007558e-06, "loss": 0.7842654, "num_input_tokens_seen": 12384895, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.53125, "step": 587, "time_per_iteration": 2.4081978797912598 }, { "auxiliary_loss_clip": 0.01193189, "auxiliary_loss_mlp": 0.01059531, "balance_loss_clip": 1.02417374, "balance_loss_mlp": 1.04124045, "epoch": 0.03535247256876597, "flos": 21467848233600.0, "grad_norm": 1.754797954220294, "language_loss": 0.78395927, "learning_rate": 3.987757590187436e-06, "loss": 0.80648649, "num_input_tokens_seen": 12404980, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.5234375, "step": 588, "time_per_iteration": 2.410745620727539 }, { "auxiliary_loss_clip": 0.01197837, "auxiliary_loss_mlp": 0.01064675, "balance_loss_clip": 1.02633774, "balance_loss_mlp": 1.04193711, "epoch": 0.03541259582143394, "flos": 23621919431040.0, "grad_norm": 2.7924246346076744, "language_loss": 0.93870485, "learning_rate": 3.987715814429359e-06, "loss": 0.96132994, "num_input_tokens_seen": 12423835, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 1.5625, "step": 589, "time_per_iteration": 2.423064947128296 }, { "auxiliary_loss_clip": 0.01199126, "auxiliary_loss_mlp": 0.01063146, "balance_loss_clip": 1.02962434, "balance_loss_mlp": 1.04581523, "epoch": 0.03547271907410191, "flos": 33725050824960.0, "grad_norm": 2.876152200613965, "language_loss": 0.83852893, "learning_rate": 3.987673967734818e-06, "loss": 0.86115164, "num_input_tokens_seen": 12443135, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.5390625, "step": 590, "time_per_iteration": 2.5066776275634766 }, { "auxiliary_loss_clip": 0.01190277, "auxiliary_loss_mlp": 0.01061021, "balance_loss_clip": 1.02862036, "balance_loss_mlp": 1.04210639, "epoch": 0.03553284232676988, "flos": 21141619689600.0, "grad_norm": 2.041992487210075, "language_loss": 0.86693615, "learning_rate": 3.987632050105306e-06, "loss": 0.88944912, "num_input_tokens_seen": 12462895, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.484375, "step": 591, "time_per_iteration": 2.4259727001190186 }, { "auxiliary_loss_clip": 0.01194508, "auxiliary_loss_mlp": 0.01070402, "balance_loss_clip": 1.034091, "balance_loss_mlp": 1.04161382, "epoch": 0.03559296557943785, "flos": 20045591377920.0, "grad_norm": 2.0630956610298865, "language_loss": 0.82878077, "learning_rate": 3.987590061542319e-06, "loss": 0.85142994, "num_input_tokens_seen": 12481515, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.53125, "step": 592, "time_per_iteration": 2.3929755687713623 }, { "auxiliary_loss_clip": 0.01072431, "auxiliary_loss_mlp": 0.01007916, "balance_loss_clip": 1.00109756, "balance_loss_mlp": 1.01578867, "epoch": 0.035653088832105814, "flos": 60331239740160.0, "grad_norm": 0.8880340868301633, "language_loss": 0.59840667, "learning_rate": 3.987548002047354e-06, "loss": 0.61921012, "num_input_tokens_seen": 12548220, "router_z_loss_clip": 0.06835938, "router_z_loss_mlp": 0.56640625, "step": 593, "time_per_iteration": 3.1164820194244385 }, { "auxiliary_loss_clip": 0.01193037, "auxiliary_loss_mlp": 0.01062465, "balance_loss_clip": 1.02639222, "balance_loss_mlp": 1.04370463, "epoch": 0.035713212084773786, "flos": 20112310719360.0, "grad_norm": 2.1609314140189433, "language_loss": 0.8677175, "learning_rate": 3.987505871621915e-06, "loss": 0.89027262, "num_input_tokens_seen": 12566105, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.4921875, "step": 594, "time_per_iteration": 2.3931171894073486 }, { "auxiliary_loss_clip": 0.01194122, "auxiliary_loss_mlp": 0.01063129, "balance_loss_clip": 1.02932084, "balance_loss_mlp": 1.04368234, "epoch": 0.03577333533744176, "flos": 26284605448320.0, "grad_norm": 1.9805413200314534, "language_loss": 0.84035844, "learning_rate": 3.987463670267502e-06, "loss": 0.86293095, "num_input_tokens_seen": 12586680, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.5078125, "step": 595, "time_per_iteration": 2.455754280090332 }, { "auxiliary_loss_clip": 0.01191518, "auxiliary_loss_mlp": 0.01065451, "balance_loss_clip": 1.03123832, "balance_loss_mlp": 1.04367185, "epoch": 0.035833458590109724, "flos": 10888955475840.0, "grad_norm": 2.8540243898721607, "language_loss": 0.9549948, "learning_rate": 3.987421397985625e-06, "loss": 0.97756451, "num_input_tokens_seen": 12601605, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.4765625, "step": 596, "time_per_iteration": 2.373399257659912 }, { "auxiliary_loss_clip": 0.0119328, "auxiliary_loss_mlp": 0.01066329, "balance_loss_clip": 1.0362879, "balance_loss_mlp": 1.04383075, "epoch": 0.035893581842777696, "flos": 22089127610880.0, "grad_norm": 7.13882265388366, "language_loss": 0.82787955, "learning_rate": 3.98737905477779e-06, "loss": 0.85047561, "num_input_tokens_seen": 12620365, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.5, "step": 597, "time_per_iteration": 2.412506580352783 }, { "auxiliary_loss_clip": 0.01191588, "auxiliary_loss_mlp": 0.0106389, "balance_loss_clip": 1.02886677, "balance_loss_mlp": 1.04515767, "epoch": 0.03595370509544566, "flos": 23037263936640.0, "grad_norm": 1.9765483286078758, "language_loss": 0.81232685, "learning_rate": 3.987336640645508e-06, "loss": 0.83488166, "num_input_tokens_seen": 12641140, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.46875, "step": 598, "time_per_iteration": 2.4125797748565674 }, { "auxiliary_loss_clip": 0.01189158, "auxiliary_loss_mlp": 0.01063772, "balance_loss_clip": 1.02927327, "balance_loss_mlp": 1.04330945, "epoch": 0.03601382834811363, "flos": 20776672581120.0, "grad_norm": 1.919076803637372, "language_loss": 0.81268477, "learning_rate": 3.987294155590295e-06, "loss": 0.83521414, "num_input_tokens_seen": 12661080, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.453125, "step": 599, "time_per_iteration": 2.4201183319091797 }, { "auxiliary_loss_clip": 0.01195125, "auxiliary_loss_mlp": 0.01064769, "balance_loss_clip": 1.03124774, "balance_loss_mlp": 1.04239631, "epoch": 0.036073951600781605, "flos": 23950487036160.0, "grad_norm": 2.7999164737974818, "language_loss": 0.85811245, "learning_rate": 3.987251599613664e-06, "loss": 0.88071138, "num_input_tokens_seen": 12678270, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 1.53125, "step": 600, "time_per_iteration": 2.4171955585479736 }, { "auxiliary_loss_clip": 0.01190231, "auxiliary_loss_mlp": 0.01068489, "balance_loss_clip": 1.03244054, "balance_loss_mlp": 1.04157639, "epoch": 0.03613407485344957, "flos": 18911403083520.0, "grad_norm": 2.2791175764803575, "language_loss": 0.81738359, "learning_rate": 3.987208972717135e-06, "loss": 0.83997083, "num_input_tokens_seen": 12697295, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.484375, "step": 601, "time_per_iteration": 2.4082608222961426 }, { "auxiliary_loss_clip": 0.01187035, "auxiliary_loss_mlp": 0.01053517, "balance_loss_clip": 1.02047205, "balance_loss_mlp": 1.04106665, "epoch": 0.03619419810611754, "flos": 23037438493440.0, "grad_norm": 2.7534643093390185, "language_loss": 0.75187588, "learning_rate": 3.987166274902231e-06, "loss": 0.77428138, "num_input_tokens_seen": 12716165, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.4609375, "step": 602, "time_per_iteration": 2.407188892364502 }, { "auxiliary_loss_clip": 0.01185516, "auxiliary_loss_mlp": 0.01061491, "balance_loss_clip": 1.02737272, "balance_loss_mlp": 1.04045725, "epoch": 0.03625432135878551, "flos": 29456569601280.0, "grad_norm": 2.065777789073131, "language_loss": 0.79639304, "learning_rate": 3.987123506170473e-06, "loss": 0.81886303, "num_input_tokens_seen": 12735475, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.453125, "step": 603, "time_per_iteration": 2.4654366970062256 }, { "auxiliary_loss_clip": 0.01189171, "auxiliary_loss_mlp": 0.01056694, "balance_loss_clip": 1.02481782, "balance_loss_mlp": 1.04381037, "epoch": 0.03631444461145348, "flos": 23507544216960.0, "grad_norm": 1.8258687398138511, "language_loss": 0.86671007, "learning_rate": 3.987080666523389e-06, "loss": 0.88916874, "num_input_tokens_seen": 12754540, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.453125, "step": 604, "time_per_iteration": 2.4170491695404053 }, { "auxiliary_loss_clip": 0.01192464, "auxiliary_loss_mlp": 0.01061029, "balance_loss_clip": 1.02710176, "balance_loss_mlp": 1.04573047, "epoch": 0.03637456786412145, "flos": 16617190222080.0, "grad_norm": 2.399624764457191, "language_loss": 0.80515403, "learning_rate": 3.987037755962506e-06, "loss": 0.82768893, "num_input_tokens_seen": 12773050, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.46875, "step": 605, "time_per_iteration": 2.4204325675964355 }, { "auxiliary_loss_clip": 0.01190658, "auxiliary_loss_mlp": 0.01063163, "balance_loss_clip": 1.03068995, "balance_loss_mlp": 1.04383564, "epoch": 0.03643469111678942, "flos": 15850916501760.0, "grad_norm": 2.413621551612539, "language_loss": 0.85129428, "learning_rate": 3.986994774489359e-06, "loss": 0.87383258, "num_input_tokens_seen": 12791240, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.46875, "step": 606, "time_per_iteration": 3.8434371948242188 }, { "auxiliary_loss_clip": 0.01193657, "auxiliary_loss_mlp": 0.01066992, "balance_loss_clip": 1.03149116, "balance_loss_mlp": 1.04402304, "epoch": 0.03649481436945739, "flos": 23619335990400.0, "grad_norm": 5.737261281953924, "language_loss": 0.8204093, "learning_rate": 3.986951722105479e-06, "loss": 0.84301579, "num_input_tokens_seen": 12812245, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.4921875, "step": 607, "time_per_iteration": 2.426976442337036 }, { "auxiliary_loss_clip": 0.01191488, "auxiliary_loss_mlp": 0.01062892, "balance_loss_clip": 1.03022838, "balance_loss_mlp": 1.04630244, "epoch": 0.036554937622125354, "flos": 21754694897280.0, "grad_norm": 3.23599896243447, "language_loss": 0.83184808, "learning_rate": 3.986908598812402e-06, "loss": 0.85439193, "num_input_tokens_seen": 12831085, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.453125, "step": 608, "time_per_iteration": 2.4064114093780518 }, { "auxiliary_loss_clip": 0.01190389, "auxiliary_loss_mlp": 0.01057344, "balance_loss_clip": 1.02160549, "balance_loss_mlp": 1.04478228, "epoch": 0.036615060874793326, "flos": 17818865907840.0, "grad_norm": 2.8337069348270045, "language_loss": 0.81716424, "learning_rate": 3.986865404611669e-06, "loss": 0.83964157, "num_input_tokens_seen": 12849115, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 1.453125, "step": 609, "time_per_iteration": 3.824737548828125 }, { "auxiliary_loss_clip": 0.0119741, "auxiliary_loss_mlp": 0.0107586, "balance_loss_clip": 1.04396009, "balance_loss_mlp": 1.0502708, "epoch": 0.0366751841274613, "flos": 26752791047040.0, "grad_norm": 1.9162090268784777, "language_loss": 0.79127526, "learning_rate": 3.98682213950482e-06, "loss": 0.814008, "num_input_tokens_seen": 12868005, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.46875, "step": 610, "time_per_iteration": 2.4709088802337646 }, { "auxiliary_loss_clip": 0.0119466, "auxiliary_loss_mlp": 0.01063045, "balance_loss_clip": 1.02954769, "balance_loss_mlp": 1.04573941, "epoch": 0.03673530738012926, "flos": 22195961971200.0, "grad_norm": 2.3790511540834864, "language_loss": 0.87558019, "learning_rate": 3.986778803493401e-06, "loss": 0.89815724, "num_input_tokens_seen": 12886890, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 1.484375, "step": 611, "time_per_iteration": 3.8363935947418213 }, { "auxiliary_loss_clip": 0.01190653, "auxiliary_loss_mlp": 0.01072803, "balance_loss_clip": 1.03725433, "balance_loss_mlp": 1.04390585, "epoch": 0.036795430632797235, "flos": 24680485987200.0, "grad_norm": 2.235179936131584, "language_loss": 0.72158015, "learning_rate": 3.986735396578956e-06, "loss": 0.74421477, "num_input_tokens_seen": 12906130, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.46875, "step": 612, "time_per_iteration": 2.440546989440918 }, { "auxiliary_loss_clip": 0.01189836, "auxiliary_loss_mlp": 0.01059669, "balance_loss_clip": 1.02497888, "balance_loss_mlp": 1.04274333, "epoch": 0.0368555538854652, "flos": 17747957203200.0, "grad_norm": 3.122582402275691, "language_loss": 0.79163623, "learning_rate": 3.986691918763034e-06, "loss": 0.81413126, "num_input_tokens_seen": 12925260, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.46875, "step": 613, "time_per_iteration": 2.392113208770752 }, { "auxiliary_loss_clip": 0.01188545, "auxiliary_loss_mlp": 0.01073452, "balance_loss_clip": 1.0383811, "balance_loss_mlp": 1.04238963, "epoch": 0.03691567713813317, "flos": 20593518255360.0, "grad_norm": 1.9597988037603629, "language_loss": 0.93362963, "learning_rate": 3.98664837004719e-06, "loss": 0.95624959, "num_input_tokens_seen": 12944590, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.4609375, "step": 614, "time_per_iteration": 2.461533308029175 }, { "auxiliary_loss_clip": 0.01193101, "auxiliary_loss_mlp": 0.01070644, "balance_loss_clip": 1.03383267, "balance_loss_mlp": 1.04623306, "epoch": 0.036975800390801145, "flos": 33649149795840.0, "grad_norm": 2.718999563947092, "language_loss": 0.73057652, "learning_rate": 3.986604750432974e-06, "loss": 0.753214, "num_input_tokens_seen": 12964785, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.46875, "step": 615, "time_per_iteration": 2.5033388137817383 }, { "auxiliary_loss_clip": 0.01193439, "auxiliary_loss_mlp": 0.01063049, "balance_loss_clip": 1.02983749, "balance_loss_mlp": 1.04255581, "epoch": 0.03703592364346911, "flos": 28292425493760.0, "grad_norm": 2.5644905920844607, "language_loss": 0.81399232, "learning_rate": 3.986561059921947e-06, "loss": 0.83655715, "num_input_tokens_seen": 12986705, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.5078125, "step": 616, "time_per_iteration": 2.496835231781006 }, { "auxiliary_loss_clip": 0.0118991, "auxiliary_loss_mlp": 0.01069732, "balance_loss_clip": 1.03745008, "balance_loss_mlp": 1.04359233, "epoch": 0.03709604689613708, "flos": 31502863831680.0, "grad_norm": 2.289703141175505, "language_loss": 0.67923647, "learning_rate": 3.986517298515664e-06, "loss": 0.70183289, "num_input_tokens_seen": 13010560, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.4609375, "step": 617, "time_per_iteration": 2.4922900199890137 }, { "auxiliary_loss_clip": 0.01193598, "auxiliary_loss_mlp": 0.01065017, "balance_loss_clip": 1.02799022, "balance_loss_mlp": 1.04717958, "epoch": 0.03715617014880505, "flos": 19608374021760.0, "grad_norm": 2.4143698713390025, "language_loss": 0.79980433, "learning_rate": 3.9864734662156884e-06, "loss": 0.82239044, "num_input_tokens_seen": 13028935, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.46875, "step": 618, "time_per_iteration": 2.4274299144744873 }, { "auxiliary_loss_clip": 0.01195582, "auxiliary_loss_mlp": 0.01072538, "balance_loss_clip": 1.03458214, "balance_loss_mlp": 1.04337478, "epoch": 0.03721629340147302, "flos": 15923291483520.0, "grad_norm": 2.799342678041712, "language_loss": 0.91307116, "learning_rate": 3.986429563023585e-06, "loss": 0.93575239, "num_input_tokens_seen": 13046000, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.5234375, "step": 619, "time_per_iteration": 2.3912317752838135 }, { "auxiliary_loss_clip": 0.01192505, "auxiliary_loss_mlp": 0.01069496, "balance_loss_clip": 1.03688025, "balance_loss_mlp": 1.04688096, "epoch": 0.03727641665414099, "flos": 21103075681920.0, "grad_norm": 2.9199639193863978, "language_loss": 0.94099218, "learning_rate": 3.986385588940921e-06, "loss": 0.9636122, "num_input_tokens_seen": 13062995, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.453125, "step": 620, "time_per_iteration": 2.392190933227539 }, { "auxiliary_loss_clip": 0.0118776, "auxiliary_loss_mlp": 0.0106845, "balance_loss_clip": 1.03087556, "balance_loss_mlp": 1.04218006, "epoch": 0.037336539906808956, "flos": 24130604073600.0, "grad_norm": 1.7018149861947345, "language_loss": 0.76863194, "learning_rate": 3.986341543969264e-06, "loss": 0.79119402, "num_input_tokens_seen": 13084120, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 1.453125, "step": 621, "time_per_iteration": 2.441282033920288 }, { "auxiliary_loss_clip": 0.01191084, "auxiliary_loss_mlp": 0.01061931, "balance_loss_clip": 1.02786076, "balance_loss_mlp": 1.04571021, "epoch": 0.03739666315947693, "flos": 22345285322880.0, "grad_norm": 2.809079720400529, "language_loss": 0.8644613, "learning_rate": 3.986297428110187e-06, "loss": 0.88699144, "num_input_tokens_seen": 13100035, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.453125, "step": 622, "time_per_iteration": 2.4315948486328125 }, { "auxiliary_loss_clip": 0.01194275, "auxiliary_loss_mlp": 0.01059122, "balance_loss_clip": 1.02452755, "balance_loss_mlp": 1.04649282, "epoch": 0.0374567864121449, "flos": 20448454089600.0, "grad_norm": 2.2307814029927964, "language_loss": 0.89798784, "learning_rate": 3.986253241365264e-06, "loss": 0.9205218, "num_input_tokens_seen": 13118070, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 1.484375, "step": 623, "time_per_iteration": 2.4165964126586914 }, { "auxiliary_loss_clip": 0.01193625, "auxiliary_loss_mlp": 0.01071854, "balance_loss_clip": 1.03711653, "balance_loss_mlp": 1.04731357, "epoch": 0.037516909664812866, "flos": 19207047409920.0, "grad_norm": 1.8172144217237507, "language_loss": 0.84119725, "learning_rate": 3.986208983736073e-06, "loss": 0.86385202, "num_input_tokens_seen": 13136355, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.4609375, "step": 624, "time_per_iteration": 2.416917562484741 }, { "auxiliary_loss_clip": 0.01191925, "auxiliary_loss_mlp": 0.01053709, "balance_loss_clip": 1.01808918, "balance_loss_mlp": 1.04193032, "epoch": 0.03757703291748084, "flos": 35003814526080.0, "grad_norm": 3.2844482048489367, "language_loss": 0.66283631, "learning_rate": 3.986164655224191e-06, "loss": 0.68529266, "num_input_tokens_seen": 13155435, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.5, "step": 625, "time_per_iteration": 2.5336241722106934 }, { "auxiliary_loss_clip": 0.01185041, "auxiliary_loss_mlp": 0.01067365, "balance_loss_clip": 1.03231764, "balance_loss_mlp": 1.04417443, "epoch": 0.0376371561701488, "flos": 25482720274560.0, "grad_norm": 2.101032365646545, "language_loss": 0.7704007, "learning_rate": 3.986120255831202e-06, "loss": 0.79292476, "num_input_tokens_seen": 13174295, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.40625, "step": 626, "time_per_iteration": 2.439168930053711 }, { "auxiliary_loss_clip": 0.0118929, "auxiliary_loss_mlp": 0.01064691, "balance_loss_clip": 1.02992964, "balance_loss_mlp": 1.04543984, "epoch": 0.037697279422816775, "flos": 18184685800320.0, "grad_norm": 1.7948574546239324, "language_loss": 0.81407958, "learning_rate": 3.986075785558691e-06, "loss": 0.83661938, "num_input_tokens_seen": 13192500, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.4375, "step": 627, "time_per_iteration": 2.3719449043273926 }, { "auxiliary_loss_clip": 0.01192957, "auxiliary_loss_mlp": 0.01069615, "balance_loss_clip": 1.03344655, "balance_loss_mlp": 1.04659152, "epoch": 0.03775740267548475, "flos": 24643128965760.0, "grad_norm": 1.630872127286863, "language_loss": 0.88502806, "learning_rate": 3.986031244408243e-06, "loss": 0.90765381, "num_input_tokens_seen": 13213470, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.4609375, "step": 628, "time_per_iteration": 2.4616754055023193 }, { "auxiliary_loss_clip": 0.01187756, "auxiliary_loss_mlp": 0.01060824, "balance_loss_clip": 1.02670586, "balance_loss_mlp": 1.04001284, "epoch": 0.03781752592815271, "flos": 21287137703040.0, "grad_norm": 3.1648139815741545, "language_loss": 0.79559755, "learning_rate": 3.985986632381449e-06, "loss": 0.81808335, "num_input_tokens_seen": 13232365, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.4765625, "step": 629, "time_per_iteration": 2.4048831462860107 }, { "auxiliary_loss_clip": 0.0118929, "auxiliary_loss_mlp": 0.01059391, "balance_loss_clip": 1.02613187, "balance_loss_mlp": 1.04272497, "epoch": 0.037877649180820684, "flos": 22088569029120.0, "grad_norm": 4.483336565305342, "language_loss": 0.76847458, "learning_rate": 3.9859419494799e-06, "loss": 0.79096138, "num_input_tokens_seen": 13251920, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.46875, "step": 630, "time_per_iteration": 2.4224445819854736 }, { "auxiliary_loss_clip": 0.0119296, "auxiliary_loss_mlp": 0.01068971, "balance_loss_clip": 1.03406703, "balance_loss_mlp": 1.04508269, "epoch": 0.03793777243348865, "flos": 14500476046080.0, "grad_norm": 3.327037065085722, "language_loss": 0.91509634, "learning_rate": 3.985897195705192e-06, "loss": 0.93771565, "num_input_tokens_seen": 13267440, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.484375, "step": 631, "time_per_iteration": 2.3660621643066406 }, { "auxiliary_loss_clip": 0.0118907, "auxiliary_loss_mlp": 0.01076706, "balance_loss_clip": 1.04106295, "balance_loss_mlp": 1.04433274, "epoch": 0.03799789568615662, "flos": 21907334828160.0, "grad_norm": 1.6012248644307439, "language_loss": 0.91935283, "learning_rate": 3.985852371058921e-06, "loss": 0.94201052, "num_input_tokens_seen": 13287850, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.4453125, "step": 632, "time_per_iteration": 2.4410040378570557 }, { "auxiliary_loss_clip": 0.01187046, "auxiliary_loss_mlp": 0.01058762, "balance_loss_clip": 1.02428651, "balance_loss_mlp": 1.04275036, "epoch": 0.03805801893882459, "flos": 24825864355200.0, "grad_norm": 1.8387130062056452, "language_loss": 0.83061844, "learning_rate": 3.985807475542687e-06, "loss": 0.85307658, "num_input_tokens_seen": 13307760, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.4453125, "step": 633, "time_per_iteration": 2.44438099861145 }, { "auxiliary_loss_clip": 0.0118841, "auxiliary_loss_mlp": 0.01060442, "balance_loss_clip": 1.02737331, "balance_loss_mlp": 1.04320788, "epoch": 0.03811814219149256, "flos": 30481619385600.0, "grad_norm": 1.6646039138205775, "language_loss": 0.69604558, "learning_rate": 3.985762509158093e-06, "loss": 0.71853411, "num_input_tokens_seen": 13331230, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.453125, "step": 634, "time_per_iteration": 2.516108989715576 }, { "auxiliary_loss_clip": 0.0107484, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 1.01931655, "balance_loss_mlp": 1.01821148, "epoch": 0.03817826544416053, "flos": 66989561639040.0, "grad_norm": 0.9040725694158229, "language_loss": 0.61635339, "learning_rate": 3.985717471906742e-06, "loss": 0.63736987, "num_input_tokens_seen": 13394760, "router_z_loss_clip": 0.07470703, "router_z_loss_mlp": 0.56640625, "step": 635, "time_per_iteration": 3.0856924057006836 }, { "auxiliary_loss_clip": 0.01185486, "auxiliary_loss_mlp": 0.01057715, "balance_loss_clip": 1.02440786, "balance_loss_mlp": 1.04052567, "epoch": 0.038238388696828496, "flos": 20484309922560.0, "grad_norm": 2.7305143207672726, "language_loss": 0.83529603, "learning_rate": 3.985672363790243e-06, "loss": 0.857728, "num_input_tokens_seen": 13412775, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.4453125, "step": 636, "time_per_iteration": 2.430511951446533 }, { "auxiliary_loss_clip": 0.01187786, "auxiliary_loss_mlp": 0.01063098, "balance_loss_clip": 1.02938521, "balance_loss_mlp": 1.04468215, "epoch": 0.03829851194949647, "flos": 17964977414400.0, "grad_norm": 2.82889058687413, "language_loss": 0.79160106, "learning_rate": 3.985627184810206e-06, "loss": 0.81410992, "num_input_tokens_seen": 13427835, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.4296875, "step": 637, "time_per_iteration": 2.394228219985962 }, { "auxiliary_loss_clip": 0.01189064, "auxiliary_loss_mlp": 0.01074372, "balance_loss_clip": 1.03841865, "balance_loss_mlp": 1.04261327, "epoch": 0.03835863520216444, "flos": 22455401351040.0, "grad_norm": 2.291592706612894, "language_loss": 0.83631814, "learning_rate": 3.985581934968241e-06, "loss": 0.85895246, "num_input_tokens_seen": 13447295, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.46875, "step": 638, "time_per_iteration": 2.4217000007629395 }, { "auxiliary_loss_clip": 0.01196848, "auxiliary_loss_mlp": 0.01064665, "balance_loss_clip": 1.02909291, "balance_loss_mlp": 1.04514432, "epoch": 0.038418758454832405, "flos": 22163317983360.0, "grad_norm": 3.242665113678473, "language_loss": 0.70392871, "learning_rate": 3.985536614265964e-06, "loss": 0.72654378, "num_input_tokens_seen": 13468455, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.515625, "step": 639, "time_per_iteration": 2.434626579284668 }, { "auxiliary_loss_clip": 0.01188056, "auxiliary_loss_mlp": 0.01069986, "balance_loss_clip": 1.03379369, "balance_loss_mlp": 1.04202294, "epoch": 0.03847888170750038, "flos": 22746332643840.0, "grad_norm": 5.606862574968034, "language_loss": 0.84624588, "learning_rate": 3.985491222704994e-06, "loss": 0.86882633, "num_input_tokens_seen": 13489085, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.4609375, "step": 640, "time_per_iteration": 2.431072235107422 }, { "auxiliary_loss_clip": 0.01191819, "auxiliary_loss_mlp": 0.01069098, "balance_loss_clip": 1.03369319, "balance_loss_mlp": 1.04466319, "epoch": 0.03853900496016834, "flos": 22710092785920.0, "grad_norm": 2.7125576891372547, "language_loss": 0.82238823, "learning_rate": 3.985445760286949e-06, "loss": 0.84499741, "num_input_tokens_seen": 13509120, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.4765625, "step": 641, "time_per_iteration": 2.419487714767456 }, { "auxiliary_loss_clip": 0.01068748, "auxiliary_loss_mlp": 0.01012281, "balance_loss_clip": 1.00498581, "balance_loss_mlp": 1.01523471, "epoch": 0.038599128212836314, "flos": 70395652569600.0, "grad_norm": 0.8889064780781849, "language_loss": 0.65465635, "learning_rate": 3.985400227013452e-06, "loss": 0.67546666, "num_input_tokens_seen": 13562005, "router_z_loss_clip": 0.07275391, "router_z_loss_mlp": 0.53515625, "step": 642, "time_per_iteration": 3.0247628688812256 }, { "auxiliary_loss_clip": 0.01191587, "auxiliary_loss_mlp": 0.01053832, "balance_loss_clip": 1.02100134, "balance_loss_mlp": 1.04353416, "epoch": 0.03865925146550429, "flos": 23294015141760.0, "grad_norm": 1.994719867029607, "language_loss": 0.79217535, "learning_rate": 3.985354622886128e-06, "loss": 0.81462955, "num_input_tokens_seen": 13582185, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.484375, "step": 643, "time_per_iteration": 2.430391788482666 }, { "auxiliary_loss_clip": 0.01188071, "auxiliary_loss_mlp": 0.01072142, "balance_loss_clip": 1.03945529, "balance_loss_mlp": 1.04187131, "epoch": 0.03871937471817225, "flos": 21429478782720.0, "grad_norm": 1.7826475929274195, "language_loss": 0.82554638, "learning_rate": 3.985308947906604e-06, "loss": 0.84814858, "num_input_tokens_seen": 13599555, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.46875, "step": 644, "time_per_iteration": 2.419905185699463 }, { "auxiliary_loss_clip": 0.01190707, "auxiliary_loss_mlp": 0.01065234, "balance_loss_clip": 1.02885127, "balance_loss_mlp": 1.04173517, "epoch": 0.038779497970840224, "flos": 34275875345280.0, "grad_norm": 2.5220844419508697, "language_loss": 0.82106018, "learning_rate": 3.985263202076511e-06, "loss": 0.84361959, "num_input_tokens_seen": 13621160, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.484375, "step": 645, "time_per_iteration": 3.919546604156494 }, { "auxiliary_loss_clip": 0.01194109, "auxiliary_loss_mlp": 0.01069637, "balance_loss_clip": 1.03518534, "balance_loss_mlp": 1.04296374, "epoch": 0.03883962122350819, "flos": 22747065782400.0, "grad_norm": 2.5742157379080894, "language_loss": 0.81492043, "learning_rate": 3.985217385397481e-06, "loss": 0.83755791, "num_input_tokens_seen": 13641915, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.515625, "step": 646, "time_per_iteration": 2.4276058673858643 }, { "auxiliary_loss_clip": 0.01192965, "auxiliary_loss_mlp": 0.01076988, "balance_loss_clip": 1.04070067, "balance_loss_mlp": 1.04868424, "epoch": 0.03889974447617616, "flos": 21944726760960.0, "grad_norm": 1.7673884490273624, "language_loss": 0.81530958, "learning_rate": 3.985171497871149e-06, "loss": 0.83800912, "num_input_tokens_seen": 13661410, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.4453125, "step": 647, "time_per_iteration": 2.4281909465789795 }, { "auxiliary_loss_clip": 0.01189059, "auxiliary_loss_mlp": 0.01066854, "balance_loss_clip": 1.03259313, "balance_loss_mlp": 1.04276097, "epoch": 0.03895986772884413, "flos": 31503457324800.0, "grad_norm": 1.9542556086114053, "language_loss": 0.8414427, "learning_rate": 3.985125539499152e-06, "loss": 0.86400187, "num_input_tokens_seen": 13681705, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.46875, "step": 648, "time_per_iteration": 5.353910207748413 }, { "auxiliary_loss_clip": 0.01189546, "auxiliary_loss_mlp": 0.01059021, "balance_loss_clip": 1.02609563, "balance_loss_mlp": 1.04522252, "epoch": 0.0390199909815121, "flos": 19900003541760.0, "grad_norm": 2.0317345177524047, "language_loss": 0.84429526, "learning_rate": 3.9850795102831315e-06, "loss": 0.86678088, "num_input_tokens_seen": 13700400, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.4375, "step": 649, "time_per_iteration": 2.4122958183288574 }, { "auxiliary_loss_clip": 0.0118993, "auxiliary_loss_mlp": 0.01065205, "balance_loss_clip": 1.02891779, "balance_loss_mlp": 1.04358125, "epoch": 0.03908011423418007, "flos": 21611515944960.0, "grad_norm": 1.8540114561548637, "language_loss": 0.79612905, "learning_rate": 3.9850334102247295e-06, "loss": 0.81868041, "num_input_tokens_seen": 13720145, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.4609375, "step": 650, "time_per_iteration": 2.431824207305908 }, { "auxiliary_loss_clip": 0.01184543, "auxiliary_loss_mlp": 0.01065113, "balance_loss_clip": 1.03082883, "balance_loss_mlp": 1.04022264, "epoch": 0.039140237486848035, "flos": 18660412252800.0, "grad_norm": 2.2190536894804413, "language_loss": 0.78213829, "learning_rate": 3.984987239325592e-06, "loss": 0.80463487, "num_input_tokens_seen": 13737500, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.4453125, "step": 651, "time_per_iteration": 3.855437994003296 }, { "auxiliary_loss_clip": 0.01187028, "auxiliary_loss_mlp": 0.01068847, "balance_loss_clip": 1.0326786, "balance_loss_mlp": 1.04069173, "epoch": 0.03920036073951601, "flos": 18660132961920.0, "grad_norm": 3.7514277523504167, "language_loss": 0.87278444, "learning_rate": 3.984940997587364e-06, "loss": 0.89534318, "num_input_tokens_seen": 13754750, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.4609375, "step": 652, "time_per_iteration": 2.42366099357605 }, { "auxiliary_loss_clip": 0.01177951, "auxiliary_loss_mlp": 0.01062097, "balance_loss_clip": 1.02807426, "balance_loss_mlp": 1.0395844, "epoch": 0.03926048399218398, "flos": 31353226277760.0, "grad_norm": 2.587133442199089, "language_loss": 0.79192305, "learning_rate": 3.984894685011699e-06, "loss": 0.8143236, "num_input_tokens_seen": 13771990, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.3828125, "step": 653, "time_per_iteration": 2.447777271270752 }, { "auxiliary_loss_clip": 0.01190905, "auxiliary_loss_mlp": 0.01072108, "balance_loss_clip": 1.03312695, "balance_loss_mlp": 1.04188657, "epoch": 0.039320607244851945, "flos": 29602297082880.0, "grad_norm": 2.3614461284920583, "language_loss": 0.8583045, "learning_rate": 3.984848301600248e-06, "loss": 0.8809346, "num_input_tokens_seen": 13792750, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 1.4921875, "step": 654, "time_per_iteration": 2.454594612121582 }, { "auxiliary_loss_clip": 0.01065384, "auxiliary_loss_mlp": 0.01015938, "balance_loss_clip": 1.00940573, "balance_loss_mlp": 1.01152527, "epoch": 0.03938073049751992, "flos": 66531151221120.0, "grad_norm": 0.7151695118935018, "language_loss": 0.49906549, "learning_rate": 3.984801847354667e-06, "loss": 0.51987869, "num_input_tokens_seen": 13858570, "router_z_loss_clip": 0.06542969, "router_z_loss_mlp": 0.5390625, "step": 655, "time_per_iteration": 3.1038155555725098 }, { "auxiliary_loss_clip": 0.01184583, "auxiliary_loss_mlp": 0.01064343, "balance_loss_clip": 1.03072572, "balance_loss_mlp": 1.0431056, "epoch": 0.03944085375018788, "flos": 23366704325760.0, "grad_norm": 2.3215592261136413, "language_loss": 0.80955482, "learning_rate": 3.984755322276614e-06, "loss": 0.83204401, "num_input_tokens_seen": 13876335, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.4140625, "step": 656, "time_per_iteration": 2.4134862422943115 }, { "auxiliary_loss_clip": 0.01196188, "auxiliary_loss_mlp": 0.01077883, "balance_loss_clip": 1.03923488, "balance_loss_mlp": 1.04710519, "epoch": 0.039500977002855854, "flos": 18547398581760.0, "grad_norm": 2.5540636579470912, "language_loss": 0.76357615, "learning_rate": 3.9847087263677485e-06, "loss": 0.78631687, "num_input_tokens_seen": 13892640, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 1.484375, "step": 657, "time_per_iteration": 2.357095718383789 }, { "auxiliary_loss_clip": 0.01192952, "auxiliary_loss_mlp": 0.01061956, "balance_loss_clip": 1.02576399, "balance_loss_mlp": 1.04402542, "epoch": 0.039561100255523826, "flos": 25336992792960.0, "grad_norm": 1.9004998410713654, "language_loss": 0.8134166, "learning_rate": 3.984662059629734e-06, "loss": 0.83596575, "num_input_tokens_seen": 13910085, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.4921875, "step": 658, "time_per_iteration": 2.44077205657959 }, { "auxiliary_loss_clip": 0.01183464, "auxiliary_loss_mlp": 0.01061993, "balance_loss_clip": 1.0252049, "balance_loss_mlp": 1.04198444, "epoch": 0.03962122350819179, "flos": 18219005533440.0, "grad_norm": 2.0504285700224885, "language_loss": 0.9085809, "learning_rate": 3.984615322064235e-06, "loss": 0.93103546, "num_input_tokens_seen": 13928800, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.4140625, "step": 659, "time_per_iteration": 2.4118030071258545 }, { "auxiliary_loss_clip": 0.01187671, "auxiliary_loss_mlp": 0.01062738, "balance_loss_clip": 1.027619, "balance_loss_mlp": 1.04179525, "epoch": 0.03968134676085976, "flos": 20521178184960.0, "grad_norm": 2.7338367910896078, "language_loss": 0.78944838, "learning_rate": 3.9845685136729215e-06, "loss": 0.81195241, "num_input_tokens_seen": 13948325, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.453125, "step": 660, "time_per_iteration": 2.4234118461608887 }, { "auxiliary_loss_clip": 0.01187967, "auxiliary_loss_mlp": 0.01062019, "balance_loss_clip": 1.02594614, "balance_loss_mlp": 1.04496956, "epoch": 0.03974147001352773, "flos": 22421395820160.0, "grad_norm": 1.6276707879309493, "language_loss": 0.81347334, "learning_rate": 3.984521634457461e-06, "loss": 0.8359732, "num_input_tokens_seen": 13969090, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.4296875, "step": 661, "time_per_iteration": 2.444849729537964 }, { "auxiliary_loss_clip": 0.01060966, "auxiliary_loss_mlp": 0.01010318, "balance_loss_clip": 1.00388038, "balance_loss_mlp": 1.00876069, "epoch": 0.0398015932661957, "flos": 71125267495680.0, "grad_norm": 0.9225560296975938, "language_loss": 0.69447446, "learning_rate": 3.98447468441953e-06, "loss": 0.71518731, "num_input_tokens_seen": 14037555, "router_z_loss_clip": 0.06445312, "router_z_loss_mlp": 0.5234375, "step": 662, "time_per_iteration": 3.17541766166687 }, { "auxiliary_loss_clip": 0.01189675, "auxiliary_loss_mlp": 0.01070304, "balance_loss_clip": 1.03501832, "balance_loss_mlp": 1.0454495, "epoch": 0.03986171651886367, "flos": 16799995434240.0, "grad_norm": 1.8352192519331945, "language_loss": 0.82945752, "learning_rate": 3.984427663560801e-06, "loss": 0.85205734, "num_input_tokens_seen": 14055765, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.4453125, "step": 663, "time_per_iteration": 2.4322595596313477 }, { "auxiliary_loss_clip": 0.01194363, "auxiliary_loss_mlp": 0.01058858, "balance_loss_clip": 1.02304792, "balance_loss_mlp": 1.04646635, "epoch": 0.03992183977153164, "flos": 24533920632960.0, "grad_norm": 2.3611688473755743, "language_loss": 0.87116724, "learning_rate": 3.984380571882954e-06, "loss": 0.89369941, "num_input_tokens_seen": 14074195, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.4765625, "step": 664, "time_per_iteration": 2.4412331581115723 }, { "auxiliary_loss_clip": 0.01183807, "auxiliary_loss_mlp": 0.01064496, "balance_loss_clip": 1.0302825, "balance_loss_mlp": 1.0417701, "epoch": 0.03998196302419961, "flos": 15595003169280.0, "grad_norm": 2.1207161045014273, "language_loss": 0.84756935, "learning_rate": 3.984333409387668e-06, "loss": 0.8700524, "num_input_tokens_seen": 14090215, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.421875, "step": 665, "time_per_iteration": 2.3960518836975098 }, { "auxiliary_loss_clip": 0.01195958, "auxiliary_loss_mlp": 0.01069138, "balance_loss_clip": 1.03170609, "balance_loss_mlp": 1.04581308, "epoch": 0.04004208627686758, "flos": 25303790223360.0, "grad_norm": 2.1601006873638107, "language_loss": 0.81672788, "learning_rate": 3.984286176076628e-06, "loss": 0.83937883, "num_input_tokens_seen": 14112150, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 1.5, "step": 666, "time_per_iteration": 2.4813365936279297 }, { "auxiliary_loss_clip": 0.01185139, "auxiliary_loss_mlp": 0.01059999, "balance_loss_clip": 1.02240062, "balance_loss_mlp": 1.04265499, "epoch": 0.04010220952953555, "flos": 23474760583680.0, "grad_norm": 1.9467012559641645, "language_loss": 0.86658657, "learning_rate": 3.984238871951518e-06, "loss": 0.88903797, "num_input_tokens_seen": 14131475, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 1.421875, "step": 667, "time_per_iteration": 2.4209389686584473 }, { "auxiliary_loss_clip": 0.01183022, "auxiliary_loss_mlp": 0.01060967, "balance_loss_clip": 1.02634835, "balance_loss_mlp": 1.04454565, "epoch": 0.04016233278220352, "flos": 18616247516160.0, "grad_norm": 2.0724305921822808, "language_loss": 0.80607831, "learning_rate": 3.984191497014026e-06, "loss": 0.82851821, "num_input_tokens_seen": 14146165, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 1.3828125, "step": 668, "time_per_iteration": 2.380335807800293 }, { "auxiliary_loss_clip": 0.01057149, "auxiliary_loss_mlp": 0.01008952, "balance_loss_clip": 1.00287223, "balance_loss_mlp": 1.00661552, "epoch": 0.040222456034871484, "flos": 70902801112320.0, "grad_norm": 0.7811409650925238, "language_loss": 0.6007818, "learning_rate": 3.984144051265844e-06, "loss": 0.62144279, "num_input_tokens_seen": 14215005, "router_z_loss_clip": 0.06079102, "router_z_loss_mlp": 0.5078125, "step": 669, "time_per_iteration": 3.1673338413238525 }, { "auxiliary_loss_clip": 0.01183658, "auxiliary_loss_mlp": 0.01063918, "balance_loss_clip": 1.02872753, "balance_loss_mlp": 1.04043889, "epoch": 0.040282579287539456, "flos": 23763701928960.0, "grad_norm": 1.7426617425744348, "language_loss": 0.86253875, "learning_rate": 3.984096534708665e-06, "loss": 0.88501447, "num_input_tokens_seen": 14235510, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.4296875, "step": 670, "time_per_iteration": 2.4509003162384033 }, { "auxiliary_loss_clip": 0.01184797, "auxiliary_loss_mlp": 0.01061892, "balance_loss_clip": 1.02713096, "balance_loss_mlp": 1.04237092, "epoch": 0.04034270254020743, "flos": 18477537217920.0, "grad_norm": 6.661894128042471, "language_loss": 0.74786806, "learning_rate": 3.9840489473441835e-06, "loss": 0.77033496, "num_input_tokens_seen": 14254565, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.4296875, "step": 671, "time_per_iteration": 2.4203484058380127 }, { "auxiliary_loss_clip": 0.01189964, "auxiliary_loss_mlp": 0.01067379, "balance_loss_clip": 1.03228378, "balance_loss_mlp": 1.04595947, "epoch": 0.040402825792875394, "flos": 17200903109760.0, "grad_norm": 1.921792455658059, "language_loss": 0.92102182, "learning_rate": 3.984001289174099e-06, "loss": 0.94359517, "num_input_tokens_seen": 14271885, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.4375, "step": 672, "time_per_iteration": 2.3875322341918945 }, { "auxiliary_loss_clip": 0.01188382, "auxiliary_loss_mlp": 0.01067851, "balance_loss_clip": 1.03127718, "balance_loss_mlp": 1.04510617, "epoch": 0.040462949045543366, "flos": 19171156665600.0, "grad_norm": 5.893165256633166, "language_loss": 0.90170169, "learning_rate": 3.983953560200113e-06, "loss": 0.92426401, "num_input_tokens_seen": 14289670, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 1.4375, "step": 673, "time_per_iteration": 2.4174492359161377 }, { "auxiliary_loss_clip": 0.01184043, "auxiliary_loss_mlp": 0.01070975, "balance_loss_clip": 1.03382957, "balance_loss_mlp": 1.04199457, "epoch": 0.04052307229821133, "flos": 24018812300160.0, "grad_norm": 1.8851288699257294, "language_loss": 0.74678195, "learning_rate": 3.983905760423926e-06, "loss": 0.76933217, "num_input_tokens_seen": 14309285, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.421875, "step": 674, "time_per_iteration": 2.4453983306884766 }, { "auxiliary_loss_clip": 0.01192146, "auxiliary_loss_mlp": 0.01056601, "balance_loss_clip": 1.01995611, "balance_loss_mlp": 1.043841, "epoch": 0.0405831955508793, "flos": 16435641818880.0, "grad_norm": 2.672433531864122, "language_loss": 0.77962393, "learning_rate": 3.983857889847247e-06, "loss": 0.80211139, "num_input_tokens_seen": 14328300, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.484375, "step": 675, "time_per_iteration": 2.4712469577789307 }, { "auxiliary_loss_clip": 0.01188736, "auxiliary_loss_mlp": 0.01067069, "balance_loss_clip": 1.03259361, "balance_loss_mlp": 1.04343069, "epoch": 0.040643318803547275, "flos": 24278775350400.0, "grad_norm": 1.777188058958025, "language_loss": 0.76703358, "learning_rate": 3.983809948471783e-06, "loss": 0.78959161, "num_input_tokens_seen": 14346395, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.453125, "step": 676, "time_per_iteration": 2.42793607711792 }, { "auxiliary_loss_clip": 0.01190334, "auxiliary_loss_mlp": 0.01066163, "balance_loss_clip": 1.03082967, "balance_loss_mlp": 1.04389369, "epoch": 0.04070344205621524, "flos": 17711123852160.0, "grad_norm": 2.550943853737293, "language_loss": 0.84916627, "learning_rate": 3.983761936299245e-06, "loss": 0.87173128, "num_input_tokens_seen": 14364605, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.4609375, "step": 677, "time_per_iteration": 2.4127068519592285 }, { "auxiliary_loss_clip": 0.01185319, "auxiliary_loss_mlp": 0.01062526, "balance_loss_clip": 1.02757335, "balance_loss_mlp": 1.04432964, "epoch": 0.04076356530888321, "flos": 26176444456320.0, "grad_norm": 1.9381617410757228, "language_loss": 0.76106936, "learning_rate": 3.983713853331345e-06, "loss": 0.78354776, "num_input_tokens_seen": 14385265, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 1.40625, "step": 678, "time_per_iteration": 2.5064332485198975 }, { "auxiliary_loss_clip": 0.01187, "auxiliary_loss_mlp": 0.01064472, "balance_loss_clip": 1.028018, "balance_loss_mlp": 1.04322159, "epoch": 0.04082368856155118, "flos": 35771973459840.0, "grad_norm": 1.9634592798462205, "language_loss": 0.82002586, "learning_rate": 3.9836656995698015e-06, "loss": 0.84254062, "num_input_tokens_seen": 14406090, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.4375, "step": 679, "time_per_iteration": 2.528810739517212 }, { "auxiliary_loss_clip": 0.01190761, "auxiliary_loss_mlp": 0.01058053, "balance_loss_clip": 1.02450764, "balance_loss_mlp": 1.04950869, "epoch": 0.04088381181421915, "flos": 28145406291840.0, "grad_norm": 3.685127405500079, "language_loss": 0.76211154, "learning_rate": 3.983617475016331e-06, "loss": 0.78459966, "num_input_tokens_seen": 14425130, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.4140625, "step": 680, "time_per_iteration": 2.4731523990631104 }, { "auxiliary_loss_clip": 0.01187026, "auxiliary_loss_mlp": 0.01062935, "balance_loss_clip": 1.02447796, "balance_loss_mlp": 1.03947425, "epoch": 0.04094393506688712, "flos": 27596501896320.0, "grad_norm": 1.9793853535666257, "language_loss": 0.83050603, "learning_rate": 3.9835691796726555e-06, "loss": 0.85300565, "num_input_tokens_seen": 14447355, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 1.4765625, "step": 681, "time_per_iteration": 2.4579238891601562 }, { "auxiliary_loss_clip": 0.01188714, "auxiliary_loss_mlp": 0.01065756, "balance_loss_clip": 1.02758527, "balance_loss_mlp": 1.04244184, "epoch": 0.04100405831955509, "flos": 23110930638720.0, "grad_norm": 1.850859883141676, "language_loss": 0.71165198, "learning_rate": 3.9835208135404986e-06, "loss": 0.73419666, "num_input_tokens_seen": 14466790, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 1.4609375, "step": 682, "time_per_iteration": 2.428924798965454 }, { "auxiliary_loss_clip": 0.01183112, "auxiliary_loss_mlp": 0.01065471, "balance_loss_clip": 1.0303278, "balance_loss_mlp": 1.04071558, "epoch": 0.04106418157222306, "flos": 20155707406080.0, "grad_norm": 1.6317417738527569, "language_loss": 0.72059846, "learning_rate": 3.9834723766215865e-06, "loss": 0.74308419, "num_input_tokens_seen": 14485195, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.421875, "step": 683, "time_per_iteration": 2.3904831409454346 }, { "auxiliary_loss_clip": 0.01184685, "auxiliary_loss_mlp": 0.0106614, "balance_loss_clip": 1.03223693, "balance_loss_mlp": 1.04592633, "epoch": 0.041124304824891024, "flos": 17419738711680.0, "grad_norm": 2.182818138980505, "language_loss": 0.81072485, "learning_rate": 3.983423868917646e-06, "loss": 0.83323312, "num_input_tokens_seen": 14503370, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.3828125, "step": 684, "time_per_iteration": 2.383819818496704 }, { "auxiliary_loss_clip": 0.01188177, "auxiliary_loss_mlp": 0.01062482, "balance_loss_clip": 1.02621865, "balance_loss_mlp": 1.04425693, "epoch": 0.041184428077558996, "flos": 25778853360000.0, "grad_norm": 1.6602237229884422, "language_loss": 0.9059425, "learning_rate": 3.983375290430411e-06, "loss": 0.92844909, "num_input_tokens_seen": 14526415, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.4375, "step": 685, "time_per_iteration": 3.942558526992798 }, { "auxiliary_loss_clip": 0.01185363, "auxiliary_loss_mlp": 0.01061075, "balance_loss_clip": 1.0242871, "balance_loss_mlp": 1.04252374, "epoch": 0.04124455133022697, "flos": 22963701968640.0, "grad_norm": 2.020637774355877, "language_loss": 0.88082665, "learning_rate": 3.983326641161613e-06, "loss": 0.90329105, "num_input_tokens_seen": 14546595, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.4296875, "step": 686, "time_per_iteration": 2.41579270362854 }, { "auxiliary_loss_clip": 0.01187174, "auxiliary_loss_mlp": 0.01067066, "balance_loss_clip": 1.02963471, "balance_loss_mlp": 1.04243541, "epoch": 0.04130467458289493, "flos": 21287975575680.0, "grad_norm": 1.8269890377548201, "language_loss": 0.71391737, "learning_rate": 3.9832779211129894e-06, "loss": 0.73645979, "num_input_tokens_seen": 14566590, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 1.453125, "step": 687, "time_per_iteration": 2.451077461242676 }, { "auxiliary_loss_clip": 0.01183612, "auxiliary_loss_mlp": 0.01060901, "balance_loss_clip": 1.02704561, "balance_loss_mlp": 1.046556, "epoch": 0.041364797835562905, "flos": 19973216396160.0, "grad_norm": 1.5390372221479989, "language_loss": 0.8611179, "learning_rate": 3.983229130286278e-06, "loss": 0.88356304, "num_input_tokens_seen": 14585965, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.3671875, "step": 688, "time_per_iteration": 5.2253546714782715 }, { "auxiliary_loss_clip": 0.01181434, "auxiliary_loss_mlp": 0.01070965, "balance_loss_clip": 1.03534508, "balance_loss_mlp": 1.04390609, "epoch": 0.04142492108823087, "flos": 21905205235200.0, "grad_norm": 1.8610843488901465, "language_loss": 0.83315575, "learning_rate": 3.98318026868322e-06, "loss": 0.85567975, "num_input_tokens_seen": 14606015, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.375, "step": 689, "time_per_iteration": 2.4173686504364014 }, { "auxiliary_loss_clip": 0.01183212, "auxiliary_loss_mlp": 0.01069871, "balance_loss_clip": 1.03622985, "balance_loss_mlp": 1.04125214, "epoch": 0.04148504434089884, "flos": 27638292660480.0, "grad_norm": 2.3858573184890948, "language_loss": 0.68026263, "learning_rate": 3.9831313363055606e-06, "loss": 0.70279348, "num_input_tokens_seen": 14629955, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.421875, "step": 690, "time_per_iteration": 3.858558416366577 }, { "auxiliary_loss_clip": 0.01178753, "auxiliary_loss_mlp": 0.01064627, "balance_loss_clip": 1.02952015, "balance_loss_mlp": 1.03993392, "epoch": 0.041545167593566815, "flos": 20517442669440.0, "grad_norm": 2.23165164324267, "language_loss": 0.74733639, "learning_rate": 3.9830823331550445e-06, "loss": 0.76977026, "num_input_tokens_seen": 14648000, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.390625, "step": 691, "time_per_iteration": 2.3846516609191895 }, { "auxiliary_loss_clip": 0.01179734, "auxiliary_loss_mlp": 0.01071205, "balance_loss_clip": 1.03522789, "balance_loss_mlp": 1.04067516, "epoch": 0.04160529084623478, "flos": 11868269512320.0, "grad_norm": 2.2544470318593404, "language_loss": 0.84076923, "learning_rate": 3.983033259233421e-06, "loss": 0.86327863, "num_input_tokens_seen": 14662235, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.390625, "step": 692, "time_per_iteration": 2.367363214492798 }, { "auxiliary_loss_clip": 0.01186203, "auxiliary_loss_mlp": 0.01064559, "balance_loss_clip": 1.02877247, "balance_loss_mlp": 1.04257679, "epoch": 0.04166541409890275, "flos": 14827472640000.0, "grad_norm": 2.7512087519687785, "language_loss": 0.88303667, "learning_rate": 3.982984114542442e-06, "loss": 0.90554428, "num_input_tokens_seen": 14676065, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 1.4375, "step": 693, "time_per_iteration": 2.3654260635375977 }, { "auxiliary_loss_clip": 0.01184472, "auxiliary_loss_mlp": 0.01060751, "balance_loss_clip": 1.02789724, "balance_loss_mlp": 1.04375386, "epoch": 0.04172553735157072, "flos": 25807063605120.0, "grad_norm": 2.1493026141193754, "language_loss": 0.81644607, "learning_rate": 3.98293489908386e-06, "loss": 0.8388983, "num_input_tokens_seen": 14694955, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.40625, "step": 694, "time_per_iteration": 2.4572255611419678 }, { "auxiliary_loss_clip": 0.01185629, "auxiliary_loss_mlp": 0.01062164, "balance_loss_clip": 1.02723527, "balance_loss_mlp": 1.04134369, "epoch": 0.04178566060423869, "flos": 24278670616320.0, "grad_norm": 1.9838667020409235, "language_loss": 0.8338263, "learning_rate": 3.982885612859432e-06, "loss": 0.85630423, "num_input_tokens_seen": 14715510, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 1.4375, "step": 695, "time_per_iteration": 2.423081398010254 }, { "auxiliary_loss_clip": 0.01187447, "auxiliary_loss_mlp": 0.0107011, "balance_loss_clip": 1.03351307, "balance_loss_mlp": 1.0442071, "epoch": 0.04184578385690666, "flos": 18221065303680.0, "grad_norm": 2.0993753783977223, "language_loss": 0.84214848, "learning_rate": 3.982836255870918e-06, "loss": 0.86472404, "num_input_tokens_seen": 14731755, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.4296875, "step": 696, "time_per_iteration": 2.3784396648406982 }, { "auxiliary_loss_clip": 0.01182217, "auxiliary_loss_mlp": 0.01071365, "balance_loss_clip": 1.03605509, "balance_loss_mlp": 1.04098535, "epoch": 0.041905907109574626, "flos": 22775450584320.0, "grad_norm": 2.124818166614912, "language_loss": 0.9306224, "learning_rate": 3.982786828120078e-06, "loss": 0.95315826, "num_input_tokens_seen": 14750810, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.4140625, "step": 697, "time_per_iteration": 2.40679669380188 }, { "auxiliary_loss_clip": 0.01178436, "auxiliary_loss_mlp": 0.01060785, "balance_loss_clip": 1.02650046, "balance_loss_mlp": 1.04041481, "epoch": 0.0419660303622426, "flos": 20155916874240.0, "grad_norm": 2.274826421908768, "language_loss": 0.8352983, "learning_rate": 3.982737329608676e-06, "loss": 0.85769051, "num_input_tokens_seen": 14768435, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.3828125, "step": 698, "time_per_iteration": 2.4244003295898438 }, { "auxiliary_loss_clip": 0.01183744, "auxiliary_loss_mlp": 0.01074651, "balance_loss_clip": 1.03903079, "balance_loss_mlp": 1.04241085, "epoch": 0.042026153614910564, "flos": 23075249362560.0, "grad_norm": 2.40989884291235, "language_loss": 0.91279924, "learning_rate": 3.98268776033848e-06, "loss": 0.9353832, "num_input_tokens_seen": 14786690, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.4140625, "step": 699, "time_per_iteration": 2.4093446731567383 }, { "auxiliary_loss_clip": 0.0106356, "auxiliary_loss_mlp": 0.01011575, "balance_loss_clip": 1.00621021, "balance_loss_mlp": 1.01348925, "epoch": 0.042086276867578536, "flos": 64491734528640.0, "grad_norm": 0.8829531456366362, "language_loss": 0.67870784, "learning_rate": 3.9826381203112575e-06, "loss": 0.6994592, "num_input_tokens_seen": 14853840, "router_z_loss_clip": 0.05371094, "router_z_loss_mlp": 0.5, "step": 700, "time_per_iteration": 3.081566333770752 }, { "auxiliary_loss_clip": 0.01188067, "auxiliary_loss_mlp": 0.01067494, "balance_loss_clip": 1.02786875, "balance_loss_mlp": 1.04447913, "epoch": 0.04214640012024651, "flos": 15486109038720.0, "grad_norm": 2.4898711189618576, "language_loss": 0.88516855, "learning_rate": 3.98258840952878e-06, "loss": 0.90772414, "num_input_tokens_seen": 14869580, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 1.4375, "step": 701, "time_per_iteration": 2.3736653327941895 }, { "auxiliary_loss_clip": 0.01184988, "auxiliary_loss_mlp": 0.0107199, "balance_loss_clip": 1.03656125, "balance_loss_mlp": 1.04638183, "epoch": 0.04220652337291447, "flos": 23875947550080.0, "grad_norm": 1.7072433387743238, "language_loss": 0.67324317, "learning_rate": 3.982538627992822e-06, "loss": 0.69581294, "num_input_tokens_seen": 14891065, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.390625, "step": 702, "time_per_iteration": 2.4374215602874756 }, { "auxiliary_loss_clip": 0.0105983, "auxiliary_loss_mlp": 0.01005343, "balance_loss_clip": 0.99969298, "balance_loss_mlp": 1.00805283, "epoch": 0.042266646625582445, "flos": 63792145238400.0, "grad_norm": 0.83391300942417, "language_loss": 0.60691524, "learning_rate": 3.98248877570516e-06, "loss": 0.62756693, "num_input_tokens_seen": 14954815, "router_z_loss_clip": 0.05639648, "router_z_loss_mlp": 0.515625, "step": 703, "time_per_iteration": 3.142807722091675 }, { "auxiliary_loss_clip": 0.01056487, "auxiliary_loss_mlp": 0.01007253, "balance_loss_clip": 1.00181758, "balance_loss_mlp": 1.00678504, "epoch": 0.04232676987825041, "flos": 50015521877760.0, "grad_norm": 1.0100126866570873, "language_loss": 0.57689762, "learning_rate": 3.982438852667574e-06, "loss": 0.59753501, "num_input_tokens_seen": 15003050, "router_z_loss_clip": 0.05444336, "router_z_loss_mlp": 0.49609375, "step": 704, "time_per_iteration": 2.9380300045013428 }, { "auxiliary_loss_clip": 0.01187186, "auxiliary_loss_mlp": 0.01060862, "balance_loss_clip": 1.02455127, "balance_loss_mlp": 1.04617953, "epoch": 0.04238689313091838, "flos": 21615041992320.0, "grad_norm": 2.3619916673472745, "language_loss": 0.87409616, "learning_rate": 3.982388858881844e-06, "loss": 0.89657664, "num_input_tokens_seen": 15021990, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.40625, "step": 705, "time_per_iteration": 2.4390175342559814 }, { "auxiliary_loss_clip": 0.01174888, "auxiliary_loss_mlp": 0.01060965, "balance_loss_clip": 1.02885032, "balance_loss_mlp": 1.04063582, "epoch": 0.042447016383586354, "flos": 19134113846400.0, "grad_norm": 1.8293907438004477, "language_loss": 0.71343666, "learning_rate": 3.982338794349755e-06, "loss": 0.7357952, "num_input_tokens_seen": 15040700, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.34375, "step": 706, "time_per_iteration": 2.386361598968506 }, { "auxiliary_loss_clip": 0.01177439, "auxiliary_loss_mlp": 0.01066396, "balance_loss_clip": 1.02932167, "balance_loss_mlp": 1.04061365, "epoch": 0.04250713963625432, "flos": 24424851945600.0, "grad_norm": 2.0036013581762693, "language_loss": 0.93354023, "learning_rate": 3.982288659073094e-06, "loss": 0.95597857, "num_input_tokens_seen": 15056725, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.3671875, "step": 707, "time_per_iteration": 2.44316029548645 }, { "auxiliary_loss_clip": 0.01181426, "auxiliary_loss_mlp": 0.01070727, "balance_loss_clip": 1.03467846, "balance_loss_mlp": 1.04033101, "epoch": 0.04256726288892229, "flos": 30366231742080.0, "grad_norm": 2.361387935515631, "language_loss": 0.8126626, "learning_rate": 3.98223845305365e-06, "loss": 0.8351841, "num_input_tokens_seen": 15077550, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.40625, "step": 708, "time_per_iteration": 2.48256254196167 }, { "auxiliary_loss_clip": 0.01187485, "auxiliary_loss_mlp": 0.01067734, "balance_loss_clip": 1.03063631, "balance_loss_mlp": 1.04343319, "epoch": 0.04262738614159026, "flos": 16361730737280.0, "grad_norm": 2.66574885580616, "language_loss": 0.81993365, "learning_rate": 3.982188176293213e-06, "loss": 0.84248579, "num_input_tokens_seen": 15094955, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.4375, "step": 709, "time_per_iteration": 2.407764434814453 }, { "auxiliary_loss_clip": 0.0118863, "auxiliary_loss_mlp": 0.0105774, "balance_loss_clip": 1.0218581, "balance_loss_mlp": 1.04400229, "epoch": 0.04268750939425823, "flos": 20411341447680.0, "grad_norm": 2.3436319116749598, "language_loss": 0.84847897, "learning_rate": 3.982137828793581e-06, "loss": 0.87094259, "num_input_tokens_seen": 15113395, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.4453125, "step": 710, "time_per_iteration": 2.4123568534851074 }, { "auxiliary_loss_clip": 0.0118838, "auxiliary_loss_mlp": 0.01067121, "balance_loss_clip": 1.03183496, "balance_loss_mlp": 1.04673469, "epoch": 0.0427476326469262, "flos": 20301923646720.0, "grad_norm": 2.682782371000687, "language_loss": 0.84520423, "learning_rate": 3.982087410556547e-06, "loss": 0.86775929, "num_input_tokens_seen": 15132920, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.421875, "step": 711, "time_per_iteration": 2.4092376232147217 }, { "auxiliary_loss_clip": 0.01179871, "auxiliary_loss_mlp": 0.01063639, "balance_loss_clip": 1.02687526, "balance_loss_mlp": 1.04195905, "epoch": 0.042807755899594166, "flos": 21649780661760.0, "grad_norm": 1.7518729085008558, "language_loss": 0.85324287, "learning_rate": 3.982036921583912e-06, "loss": 0.875678, "num_input_tokens_seen": 15153115, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.375, "step": 712, "time_per_iteration": 2.4427237510681152 }, { "auxiliary_loss_clip": 0.01187882, "auxiliary_loss_mlp": 0.01059618, "balance_loss_clip": 1.02557182, "balance_loss_mlp": 1.04309583, "epoch": 0.04286787915226214, "flos": 21433912525440.0, "grad_norm": 3.1329119544886876, "language_loss": 0.91045451, "learning_rate": 3.981986361877479e-06, "loss": 0.93292952, "num_input_tokens_seen": 15172770, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.453125, "step": 713, "time_per_iteration": 2.4084465503692627 }, { "auxiliary_loss_clip": 0.01058632, "auxiliary_loss_mlp": 0.01018882, "balance_loss_clip": 1.01318336, "balance_loss_mlp": 1.00909543, "epoch": 0.04292800240493011, "flos": 66394256313600.0, "grad_norm": 0.8888646137103391, "language_loss": 0.63704062, "learning_rate": 3.9819357314390494e-06, "loss": 0.65781581, "num_input_tokens_seen": 15240055, "router_z_loss_clip": 0.05688477, "router_z_loss_mlp": 0.49609375, "step": 714, "time_per_iteration": 3.1690354347229004 }, { "auxiliary_loss_clip": 0.01181426, "auxiliary_loss_mlp": 0.01074384, "balance_loss_clip": 1.04050469, "balance_loss_mlp": 1.04453063, "epoch": 0.042988125657598075, "flos": 31648905515520.0, "grad_norm": 2.1541672162311065, "language_loss": 0.74600798, "learning_rate": 3.981885030270432e-06, "loss": 0.76856601, "num_input_tokens_seen": 15261585, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.375, "step": 715, "time_per_iteration": 2.5548620223999023 }, { "auxiliary_loss_clip": 0.01186135, "auxiliary_loss_mlp": 0.01066759, "balance_loss_clip": 1.02880299, "balance_loss_mlp": 1.04575384, "epoch": 0.04304824891026605, "flos": 33247264602240.0, "grad_norm": 1.8331696717597785, "language_loss": 0.72439748, "learning_rate": 3.981834258373437e-06, "loss": 0.74692643, "num_input_tokens_seen": 15281160, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.40625, "step": 716, "time_per_iteration": 2.5122344493865967 }, { "auxiliary_loss_clip": 0.01051827, "auxiliary_loss_mlp": 0.01005289, "balance_loss_clip": 0.99956697, "balance_loss_mlp": 1.00391102, "epoch": 0.04310837216293401, "flos": 64061080508160.0, "grad_norm": 0.9005979170358152, "language_loss": 0.65497255, "learning_rate": 3.981783415749874e-06, "loss": 0.67554367, "num_input_tokens_seen": 15344505, "router_z_loss_clip": 0.05712891, "router_z_loss_mlp": 0.48046875, "step": 717, "time_per_iteration": 3.0917444229125977 }, { "auxiliary_loss_clip": 0.01050883, "auxiliary_loss_mlp": 0.01005029, "balance_loss_clip": 0.99964118, "balance_loss_mlp": 1.00407958, "epoch": 0.043168495415601985, "flos": 61340719057920.0, "grad_norm": 0.9752943296857631, "language_loss": 0.58790207, "learning_rate": 3.9817325024015596e-06, "loss": 0.6084612, "num_input_tokens_seen": 15404050, "router_z_loss_clip": 0.05395508, "router_z_loss_mlp": 0.46875, "step": 718, "time_per_iteration": 2.9039950370788574 }, { "auxiliary_loss_clip": 0.01183786, "auxiliary_loss_mlp": 0.01068357, "balance_loss_clip": 1.03123498, "balance_loss_mlp": 1.04652083, "epoch": 0.04322861866826996, "flos": 20703215347200.0, "grad_norm": 1.9132710050399087, "language_loss": 0.91328299, "learning_rate": 3.9816815183303086e-06, "loss": 0.93580437, "num_input_tokens_seen": 15424190, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 1.375, "step": 719, "time_per_iteration": 2.425180435180664 }, { "auxiliary_loss_clip": 0.01181178, "auxiliary_loss_mlp": 0.01071679, "balance_loss_clip": 1.03806162, "balance_loss_mlp": 1.04153848, "epoch": 0.04328874192093792, "flos": 30372027027840.0, "grad_norm": 1.6259532171414055, "language_loss": 0.66515422, "learning_rate": 3.981630463537942e-06, "loss": 0.68768275, "num_input_tokens_seen": 15446500, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.3984375, "step": 720, "time_per_iteration": 2.474658250808716 }, { "auxiliary_loss_clip": 0.01180216, "auxiliary_loss_mlp": 0.01068949, "balance_loss_clip": 1.03428292, "balance_loss_mlp": 1.04478395, "epoch": 0.043348865173605894, "flos": 21943714331520.0, "grad_norm": 2.3921306796946364, "language_loss": 0.77202111, "learning_rate": 3.981579338026282e-06, "loss": 0.79451281, "num_input_tokens_seen": 15465830, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.359375, "step": 721, "time_per_iteration": 2.426039457321167 }, { "auxiliary_loss_clip": 0.01183751, "auxiliary_loss_mlp": 0.01077762, "balance_loss_clip": 1.04156971, "balance_loss_mlp": 1.04415536, "epoch": 0.04340898842627386, "flos": 15263433187200.0, "grad_norm": 2.822492763484581, "language_loss": 0.88540536, "learning_rate": 3.981528141797153e-06, "loss": 0.9080205, "num_input_tokens_seen": 15479985, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.390625, "step": 722, "time_per_iteration": 2.366525888442993 }, { "auxiliary_loss_clip": 0.01190227, "auxiliary_loss_mlp": 0.01067458, "balance_loss_clip": 1.03372216, "balance_loss_mlp": 1.04493773, "epoch": 0.04346911167894183, "flos": 27964172090880.0, "grad_norm": 1.9351530887289412, "language_loss": 0.84070444, "learning_rate": 3.981476874852382e-06, "loss": 0.86328125, "num_input_tokens_seen": 15501545, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.453125, "step": 723, "time_per_iteration": 2.4779179096221924 }, { "auxiliary_loss_clip": 0.01187966, "auxiliary_loss_mlp": 0.01072437, "balance_loss_clip": 1.03755665, "balance_loss_mlp": 1.04717469, "epoch": 0.0435292349316098, "flos": 29240910933120.0, "grad_norm": 1.9139788895422787, "language_loss": 0.82327592, "learning_rate": 3.981425537193796e-06, "loss": 0.84587997, "num_input_tokens_seen": 15521725, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.40625, "step": 724, "time_per_iteration": 3.8979430198669434 }, { "auxiliary_loss_clip": 0.01182732, "auxiliary_loss_mlp": 0.01068706, "balance_loss_clip": 1.03420663, "balance_loss_mlp": 1.04488754, "epoch": 0.04358935818427777, "flos": 20557313308800.0, "grad_norm": 1.8369900418297336, "language_loss": 0.79121196, "learning_rate": 3.981374128823232e-06, "loss": 0.81372637, "num_input_tokens_seen": 15540910, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 1.3828125, "step": 725, "time_per_iteration": 2.4161336421966553 }, { "auxiliary_loss_clip": 0.01194352, "auxiliary_loss_mlp": 0.01072789, "balance_loss_clip": 1.03611982, "balance_loss_mlp": 1.04817867, "epoch": 0.04364948143694574, "flos": 14464061631360.0, "grad_norm": 2.12562964942191, "language_loss": 0.86453843, "learning_rate": 3.981322649742521e-06, "loss": 0.88720989, "num_input_tokens_seen": 15558640, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.4609375, "step": 726, "time_per_iteration": 2.3939080238342285 }, { "auxiliary_loss_clip": 0.01053085, "auxiliary_loss_mlp": 0.01004662, "balance_loss_clip": 0.99972701, "balance_loss_mlp": 1.00666237, "epoch": 0.043709604689613706, "flos": 50064610982400.0, "grad_norm": 0.9096007450487622, "language_loss": 0.55918157, "learning_rate": 3.9812710999535005e-06, "loss": 0.579759, "num_input_tokens_seen": 15612975, "router_z_loss_clip": 0.04931641, "router_z_loss_mlp": 0.46484375, "step": 727, "time_per_iteration": 4.51263689994812 }, { "auxiliary_loss_clip": 0.01187576, "auxiliary_loss_mlp": 0.01065319, "balance_loss_clip": 1.02750552, "balance_loss_mlp": 1.04635787, "epoch": 0.04376972794228168, "flos": 13990709151360.0, "grad_norm": 1.8794382856077294, "language_loss": 0.81984973, "learning_rate": 3.981219479458012e-06, "loss": 0.84237874, "num_input_tokens_seen": 15631070, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 1.4140625, "step": 728, "time_per_iteration": 3.7680346965789795 }, { "auxiliary_loss_clip": 0.01179165, "auxiliary_loss_mlp": 0.01065663, "balance_loss_clip": 1.03216577, "balance_loss_mlp": 1.04445708, "epoch": 0.04382985119494965, "flos": 22009037218560.0, "grad_norm": 2.383410342674767, "language_loss": 0.76899624, "learning_rate": 3.981167788257896e-06, "loss": 0.79144454, "num_input_tokens_seen": 15647825, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.34375, "step": 729, "time_per_iteration": 3.801180124282837 }, { "auxiliary_loss_clip": 0.01184388, "auxiliary_loss_mlp": 0.01062557, "balance_loss_clip": 1.02722335, "balance_loss_mlp": 1.04415679, "epoch": 0.043889974447617615, "flos": 24205387939200.0, "grad_norm": 2.001423814994023, "language_loss": 0.9496327, "learning_rate": 3.9811160263549985e-06, "loss": 0.97210211, "num_input_tokens_seen": 15668260, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.40625, "step": 730, "time_per_iteration": 2.4248015880584717 }, { "auxiliary_loss_clip": 0.01182056, "auxiliary_loss_mlp": 0.0106814, "balance_loss_clip": 1.03128076, "balance_loss_mlp": 1.04220426, "epoch": 0.04395009770028559, "flos": 17273592293760.0, "grad_norm": 2.3730318760653777, "language_loss": 0.8861438, "learning_rate": 3.981064193751166e-06, "loss": 0.90864581, "num_input_tokens_seen": 15685630, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.3984375, "step": 731, "time_per_iteration": 2.3986494541168213 }, { "auxiliary_loss_clip": 0.01182096, "auxiliary_loss_mlp": 0.01063611, "balance_loss_clip": 1.02999365, "balance_loss_mlp": 1.04369128, "epoch": 0.04401022095295355, "flos": 12309536586240.0, "grad_norm": 2.8991978654045716, "language_loss": 0.88705492, "learning_rate": 3.981012290448247e-06, "loss": 0.90951192, "num_input_tokens_seen": 15698645, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.3828125, "step": 732, "time_per_iteration": 2.382936716079712 }, { "auxiliary_loss_clip": 0.0118338, "auxiliary_loss_mlp": 0.01062163, "balance_loss_clip": 1.02690125, "balance_loss_mlp": 1.04321599, "epoch": 0.044070344205621524, "flos": 20958605009280.0, "grad_norm": 2.0845642292686395, "language_loss": 0.86170357, "learning_rate": 3.980960316448097e-06, "loss": 0.88415903, "num_input_tokens_seen": 15716775, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.40625, "step": 733, "time_per_iteration": 2.4018895626068115 }, { "auxiliary_loss_clip": 0.01187338, "auxiliary_loss_mlp": 0.01066768, "balance_loss_clip": 1.03024256, "balance_loss_mlp": 1.0461601, "epoch": 0.044130467458289496, "flos": 13844423088000.0, "grad_norm": 4.290017182560329, "language_loss": 0.90916038, "learning_rate": 3.980908271752567e-06, "loss": 0.93170148, "num_input_tokens_seen": 15733320, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 1.4140625, "step": 734, "time_per_iteration": 2.3619699478149414 }, { "auxiliary_loss_clip": 0.01180765, "auxiliary_loss_mlp": 0.01060107, "balance_loss_clip": 1.02715731, "balance_loss_mlp": 1.04557741, "epoch": 0.04419059071095746, "flos": 28653881466240.0, "grad_norm": 1.915025457554586, "language_loss": 0.77842975, "learning_rate": 3.980856156363518e-06, "loss": 0.80083847, "num_input_tokens_seen": 15752705, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.3515625, "step": 735, "time_per_iteration": 2.490703582763672 }, { "auxiliary_loss_clip": 0.01177451, "auxiliary_loss_mlp": 0.01060491, "balance_loss_clip": 1.0279355, "balance_loss_mlp": 1.04102802, "epoch": 0.04425071396362543, "flos": 28182065086080.0, "grad_norm": 2.359563242556638, "language_loss": 0.88532102, "learning_rate": 3.980803970282806e-06, "loss": 0.90770042, "num_input_tokens_seen": 15772800, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.3671875, "step": 736, "time_per_iteration": 2.4679150581359863 }, { "auxiliary_loss_clip": 0.01180427, "auxiliary_loss_mlp": 0.01066654, "balance_loss_clip": 1.03225017, "balance_loss_mlp": 1.046525, "epoch": 0.0443108372162934, "flos": 23657356327680.0, "grad_norm": 1.934699423655556, "language_loss": 0.84254616, "learning_rate": 3.980751713512298e-06, "loss": 0.86501706, "num_input_tokens_seen": 15793665, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.34375, "step": 737, "time_per_iteration": 2.4656014442443848 }, { "auxiliary_loss_clip": 0.01185789, "auxiliary_loss_mlp": 0.01069531, "balance_loss_clip": 1.03195643, "balance_loss_mlp": 1.04592919, "epoch": 0.04437096046896137, "flos": 33978590184960.0, "grad_norm": 1.8838707245702677, "language_loss": 0.84660316, "learning_rate": 3.980699386053855e-06, "loss": 0.86915642, "num_input_tokens_seen": 15813175, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 1.3984375, "step": 738, "time_per_iteration": 2.5087547302246094 }, { "auxiliary_loss_clip": 0.01054058, "auxiliary_loss_mlp": 0.0100866, "balance_loss_clip": 1.00396276, "balance_loss_mlp": 1.00775146, "epoch": 0.04443108372162934, "flos": 67394379386880.0, "grad_norm": 0.8607474266972598, "language_loss": 0.59154689, "learning_rate": 3.9806469879093465e-06, "loss": 0.61217415, "num_input_tokens_seen": 15872050, "router_z_loss_clip": 0.046875, "router_z_loss_mlp": 0.46289062, "step": 739, "time_per_iteration": 3.008528470993042 }, { "auxiliary_loss_clip": 0.01178647, "auxiliary_loss_mlp": 0.01063742, "balance_loss_clip": 1.03043461, "balance_loss_mlp": 1.0452528, "epoch": 0.04449120697429731, "flos": 29751376055040.0, "grad_norm": 2.074928094832132, "language_loss": 0.90996939, "learning_rate": 3.9805945190806415e-06, "loss": 0.93239331, "num_input_tokens_seen": 15891085, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.3359375, "step": 740, "time_per_iteration": 2.461949348449707 }, { "auxiliary_loss_clip": 0.01184099, "auxiliary_loss_mlp": 0.01063316, "balance_loss_clip": 1.02836418, "balance_loss_mlp": 1.04532051, "epoch": 0.04455133022696528, "flos": 36500645779200.0, "grad_norm": 1.9530878257015465, "language_loss": 0.71967971, "learning_rate": 3.980541979569614e-06, "loss": 0.74215388, "num_input_tokens_seen": 15914225, "router_z_loss_clip": 0.34960938, "router_z_loss_mlp": 1.390625, "step": 741, "time_per_iteration": 2.5428102016448975 }, { "auxiliary_loss_clip": 0.01177469, "auxiliary_loss_mlp": 0.01066288, "balance_loss_clip": 1.03188419, "balance_loss_mlp": 1.04174197, "epoch": 0.044611453479633245, "flos": 28802401856640.0, "grad_norm": 1.9194179518673538, "language_loss": 0.88805389, "learning_rate": 3.980489369378136e-06, "loss": 0.91049147, "num_input_tokens_seen": 15934540, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.359375, "step": 742, "time_per_iteration": 2.461562395095825 }, { "auxiliary_loss_clip": 0.01177628, "auxiliary_loss_mlp": 0.01059539, "balance_loss_clip": 1.0233475, "balance_loss_mlp": 1.04239082, "epoch": 0.04467157673230122, "flos": 20009945013120.0, "grad_norm": 1.8610295912199888, "language_loss": 0.83681965, "learning_rate": 3.980436688508087e-06, "loss": 0.8591913, "num_input_tokens_seen": 15952560, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 1.3515625, "step": 743, "time_per_iteration": 2.410418748855591 }, { "auxiliary_loss_clip": 0.01180885, "auxiliary_loss_mlp": 0.0106705, "balance_loss_clip": 1.031955, "balance_loss_mlp": 1.04375339, "epoch": 0.04473169998496919, "flos": 18003975269760.0, "grad_norm": 2.0392217011253617, "language_loss": 0.79766238, "learning_rate": 3.980383936961348e-06, "loss": 0.82014179, "num_input_tokens_seen": 15970620, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 1.375, "step": 744, "time_per_iteration": 2.3963050842285156 }, { "auxiliary_loss_clip": 0.01175583, "auxiliary_loss_mlp": 0.01060617, "balance_loss_clip": 1.02821624, "balance_loss_mlp": 1.04345059, "epoch": 0.044791823237637154, "flos": 20630665808640.0, "grad_norm": 2.062579014162858, "language_loss": 0.85017085, "learning_rate": 3.980331114739799e-06, "loss": 0.87253284, "num_input_tokens_seen": 15987325, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.3203125, "step": 745, "time_per_iteration": 2.41471791267395 }, { "auxiliary_loss_clip": 0.01179399, "auxiliary_loss_mlp": 0.01054924, "balance_loss_clip": 1.0201149, "balance_loss_mlp": 1.04224062, "epoch": 0.04485194649030513, "flos": 31174819896960.0, "grad_norm": 1.8628480270544208, "language_loss": 0.68768948, "learning_rate": 3.980278221845328e-06, "loss": 0.7100327, "num_input_tokens_seen": 16008310, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.375, "step": 746, "time_per_iteration": 2.485377788543701 }, { "auxiliary_loss_clip": 0.01184604, "auxiliary_loss_mlp": 0.01069791, "balance_loss_clip": 1.03326559, "balance_loss_mlp": 1.04763985, "epoch": 0.04491206974297309, "flos": 26142019989120.0, "grad_norm": 3.637756553710533, "language_loss": 0.68110108, "learning_rate": 3.98022525827982e-06, "loss": 0.70364505, "num_input_tokens_seen": 16029620, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 1.3671875, "step": 747, "time_per_iteration": 2.474304437637329 }, { "auxiliary_loss_clip": 0.01187419, "auxiliary_loss_mlp": 0.01082806, "balance_loss_clip": 1.04728198, "balance_loss_mlp": 1.04606819, "epoch": 0.044972192995641064, "flos": 20666626375680.0, "grad_norm": 2.2752135269186105, "language_loss": 0.66599447, "learning_rate": 3.980172224045168e-06, "loss": 0.68869674, "num_input_tokens_seen": 16049065, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.4140625, "step": 748, "time_per_iteration": 2.415919303894043 }, { "auxiliary_loss_clip": 0.01183825, "auxiliary_loss_mlp": 0.0107098, "balance_loss_clip": 1.03421593, "balance_loss_mlp": 1.04668474, "epoch": 0.045032316248309036, "flos": 16105922138880.0, "grad_norm": 3.0247764736933203, "language_loss": 0.76647866, "learning_rate": 3.980119119143262e-06, "loss": 0.78902674, "num_input_tokens_seen": 16066765, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.3671875, "step": 749, "time_per_iteration": 2.461630344390869 }, { "auxiliary_loss_clip": 0.01184256, "auxiliary_loss_mlp": 0.01067013, "balance_loss_clip": 1.03275204, "balance_loss_mlp": 1.04713106, "epoch": 0.045092439500977, "flos": 17857863763200.0, "grad_norm": 1.9949543408587387, "language_loss": 0.88806438, "learning_rate": 3.980065943575998e-06, "loss": 0.91057712, "num_input_tokens_seen": 16085980, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.375, "step": 750, "time_per_iteration": 2.389357089996338 }, { "auxiliary_loss_clip": 0.01185133, "auxiliary_loss_mlp": 0.01076102, "balance_loss_clip": 1.03704882, "balance_loss_mlp": 1.0451926, "epoch": 0.04515256275364497, "flos": 24461650385280.0, "grad_norm": 4.792346322114513, "language_loss": 0.74504662, "learning_rate": 3.9800126973452725e-06, "loss": 0.76765895, "num_input_tokens_seen": 16106260, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 1.3984375, "step": 751, "time_per_iteration": 2.466661214828491 }, { "auxiliary_loss_clip": 0.01177467, "auxiliary_loss_mlp": 0.01066612, "balance_loss_clip": 1.02989578, "balance_loss_mlp": 1.04091656, "epoch": 0.04521268600631294, "flos": 20915522524800.0, "grad_norm": 1.8841999039596504, "language_loss": 0.68607342, "learning_rate": 3.979959380452989e-06, "loss": 0.70851421, "num_input_tokens_seen": 16123475, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 1.359375, "step": 752, "time_per_iteration": 2.3971285820007324 }, { "auxiliary_loss_clip": 0.01178181, "auxiliary_loss_mlp": 0.01054287, "balance_loss_clip": 1.01993108, "balance_loss_mlp": 1.04174387, "epoch": 0.04527280925898091, "flos": 13370512026240.0, "grad_norm": 2.5533987417203603, "language_loss": 0.9229058, "learning_rate": 3.979905992901047e-06, "loss": 0.94523054, "num_input_tokens_seen": 16138335, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.359375, "step": 753, "time_per_iteration": 2.3955495357513428 }, { "auxiliary_loss_clip": 0.01183752, "auxiliary_loss_mlp": 0.01066287, "balance_loss_clip": 1.03231251, "balance_loss_mlp": 1.04663646, "epoch": 0.04533293251164888, "flos": 23253551009280.0, "grad_norm": 1.9030511128020393, "language_loss": 0.91005522, "learning_rate": 3.979852534691353e-06, "loss": 0.93255562, "num_input_tokens_seen": 16157110, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.375, "step": 754, "time_per_iteration": 2.4177744388580322 }, { "auxiliary_loss_clip": 0.01172809, "auxiliary_loss_mlp": 0.01066015, "balance_loss_clip": 1.03161073, "balance_loss_mlp": 1.04385495, "epoch": 0.04539305576431685, "flos": 12421188714240.0, "grad_norm": 2.3406486534664896, "language_loss": 0.78643274, "learning_rate": 3.979799005825816e-06, "loss": 0.80882096, "num_input_tokens_seen": 16174155, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.2890625, "step": 755, "time_per_iteration": 2.388094902038574 }, { "auxiliary_loss_clip": 0.01182913, "auxiliary_loss_mlp": 0.01077339, "balance_loss_clip": 1.04102838, "balance_loss_mlp": 1.04511786, "epoch": 0.04545317901698482, "flos": 16070066305920.0, "grad_norm": 2.104218086446312, "language_loss": 0.78481936, "learning_rate": 3.979745406306345e-06, "loss": 0.80742186, "num_input_tokens_seen": 16192240, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 1.375, "step": 756, "time_per_iteration": 2.375627279281616 }, { "auxiliary_loss_clip": 0.01054118, "auxiliary_loss_mlp": 0.0100707, "balance_loss_clip": 1.00187278, "balance_loss_mlp": 1.00968564, "epoch": 0.045513302269652785, "flos": 66392475834240.0, "grad_norm": 0.8078967859892556, "language_loss": 0.62762362, "learning_rate": 3.979691736134852e-06, "loss": 0.6482355, "num_input_tokens_seen": 16255775, "router_z_loss_clip": 0.05200195, "router_z_loss_mlp": 0.4453125, "step": 757, "time_per_iteration": 3.08660626411438 }, { "auxiliary_loss_clip": 0.01180996, "auxiliary_loss_mlp": 0.01061264, "balance_loss_clip": 1.02705073, "balance_loss_mlp": 1.04657936, "epoch": 0.04557342552232076, "flos": 21470082560640.0, "grad_norm": 2.0456393741536685, "language_loss": 0.84172112, "learning_rate": 3.979637995313254e-06, "loss": 0.86414373, "num_input_tokens_seen": 16277015, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.34375, "step": 758, "time_per_iteration": 2.445692539215088 }, { "auxiliary_loss_clip": 0.01172712, "auxiliary_loss_mlp": 0.01062261, "balance_loss_clip": 1.02950215, "balance_loss_mlp": 1.03983974, "epoch": 0.04563354877498873, "flos": 23731546700160.0, "grad_norm": 1.9123058886883226, "language_loss": 0.88420147, "learning_rate": 3.979584183843468e-06, "loss": 0.90655118, "num_input_tokens_seen": 16296005, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.328125, "step": 759, "time_per_iteration": 2.4633443355560303 }, { "auxiliary_loss_clip": 0.01183593, "auxiliary_loss_mlp": 0.01063908, "balance_loss_clip": 1.02807355, "balance_loss_mlp": 1.04788387, "epoch": 0.045693672027656694, "flos": 25734653712000.0, "grad_norm": 2.305370252748593, "language_loss": 0.73975301, "learning_rate": 3.979530301727414e-06, "loss": 0.76222801, "num_input_tokens_seen": 16315300, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 1.359375, "step": 760, "time_per_iteration": 2.4366261959075928 }, { "auxiliary_loss_clip": 0.01179764, "auxiliary_loss_mlp": 0.01055823, "balance_loss_clip": 1.02196741, "balance_loss_mlp": 1.0474813, "epoch": 0.045753795280324666, "flos": 19718001290880.0, "grad_norm": 1.965163134473522, "language_loss": 0.82210457, "learning_rate": 3.979476348967016e-06, "loss": 0.84446049, "num_input_tokens_seen": 16333820, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.328125, "step": 761, "time_per_iteration": 2.4243874549865723 }, { "auxiliary_loss_clip": 0.01175688, "auxiliary_loss_mlp": 0.01062024, "balance_loss_clip": 1.02773905, "balance_loss_mlp": 1.04460287, "epoch": 0.04581391853299264, "flos": 23254737995520.0, "grad_norm": 1.669881972747545, "language_loss": 0.7976234, "learning_rate": 3.979422325564199e-06, "loss": 0.82000047, "num_input_tokens_seen": 16355290, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.3125, "step": 762, "time_per_iteration": 2.4524641036987305 }, { "auxiliary_loss_clip": 0.01048775, "auxiliary_loss_mlp": 0.01005676, "balance_loss_clip": 1.0008601, "balance_loss_mlp": 1.00495434, "epoch": 0.0458740417856606, "flos": 64227896317440.0, "grad_norm": 0.9980029867422425, "language_loss": 0.58720791, "learning_rate": 3.979368231520891e-06, "loss": 0.60775238, "num_input_tokens_seen": 16415995, "router_z_loss_clip": 0.0480957, "router_z_loss_mlp": 0.4375, "step": 763, "time_per_iteration": 3.0900869369506836 }, { "auxiliary_loss_clip": 0.01180184, "auxiliary_loss_mlp": 0.01071259, "balance_loss_clip": 1.03927493, "balance_loss_mlp": 1.04385817, "epoch": 0.045934165038328575, "flos": 20769271372800.0, "grad_norm": 2.0523051948717885, "language_loss": 0.87536454, "learning_rate": 3.979314066839022e-06, "loss": 0.89787894, "num_input_tokens_seen": 16433120, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.359375, "step": 764, "time_per_iteration": 3.8769683837890625 }, { "auxiliary_loss_clip": 0.01179282, "auxiliary_loss_mlp": 0.01075631, "balance_loss_clip": 1.03967762, "balance_loss_mlp": 1.04498005, "epoch": 0.04599428829099654, "flos": 30261596797440.0, "grad_norm": 2.4330211626417233, "language_loss": 0.85370469, "learning_rate": 3.979259831520526e-06, "loss": 0.87625384, "num_input_tokens_seen": 16453360, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 1.34375, "step": 765, "time_per_iteration": 2.490962266921997 }, { "auxiliary_loss_clip": 0.01181964, "auxiliary_loss_mlp": 0.01070433, "balance_loss_clip": 1.03176188, "balance_loss_mlp": 1.04626715, "epoch": 0.04605441154366451, "flos": 23037822518400.0, "grad_norm": 2.832774509546428, "language_loss": 0.88183564, "learning_rate": 3.979205525567337e-06, "loss": 0.90435958, "num_input_tokens_seen": 16471160, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 1.359375, "step": 766, "time_per_iteration": 2.4284579753875732 }, { "auxiliary_loss_clip": 0.01174969, "auxiliary_loss_mlp": 0.01072638, "balance_loss_clip": 1.03797174, "balance_loss_mlp": 1.04154038, "epoch": 0.046114534796332485, "flos": 22016333692800.0, "grad_norm": 13.411650425654186, "language_loss": 0.83985424, "learning_rate": 3.979151148981395e-06, "loss": 0.86233032, "num_input_tokens_seen": 16488940, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.3359375, "step": 767, "time_per_iteration": 3.95444393157959 }, { "auxiliary_loss_clip": 0.01178257, "auxiliary_loss_mlp": 0.01061193, "balance_loss_clip": 1.02736187, "balance_loss_mlp": 1.04422903, "epoch": 0.04617465804900045, "flos": 29861073146880.0, "grad_norm": 5.4049371492260905, "language_loss": 0.8675254, "learning_rate": 3.979096701764638e-06, "loss": 0.88991988, "num_input_tokens_seen": 16509505, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.34375, "step": 768, "time_per_iteration": 2.4905314445495605 }, { "auxiliary_loss_clip": 0.01176369, "auxiliary_loss_mlp": 0.01066782, "balance_loss_clip": 1.0337851, "balance_loss_mlp": 1.04094982, "epoch": 0.04623478130166842, "flos": 25628866692480.0, "grad_norm": 2.37342504986601, "language_loss": 0.75016659, "learning_rate": 3.979042183919012e-06, "loss": 0.77259809, "num_input_tokens_seen": 16528840, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.3515625, "step": 769, "time_per_iteration": 3.8787219524383545 }, { "auxiliary_loss_clip": 0.01175254, "auxiliary_loss_mlp": 0.0106277, "balance_loss_clip": 1.02986789, "balance_loss_mlp": 1.04421747, "epoch": 0.04629490455433639, "flos": 20448035153280.0, "grad_norm": 40.211569394658184, "language_loss": 0.8622731, "learning_rate": 3.97898759544646e-06, "loss": 0.88465333, "num_input_tokens_seen": 16548335, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.3046875, "step": 770, "time_per_iteration": 2.41145920753479 }, { "auxiliary_loss_clip": 0.01179355, "auxiliary_loss_mlp": 0.01064699, "balance_loss_clip": 1.03155899, "balance_loss_mlp": 1.0424161, "epoch": 0.04635502780700436, "flos": 23147624344320.0, "grad_norm": 2.290435336935505, "language_loss": 0.8721177, "learning_rate": 3.978932936348932e-06, "loss": 0.89455825, "num_input_tokens_seen": 16567725, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.3671875, "step": 771, "time_per_iteration": 2.4312503337860107 }, { "auxiliary_loss_clip": 0.01181348, "auxiliary_loss_mlp": 0.0107427, "balance_loss_clip": 1.03600359, "balance_loss_mlp": 1.04299688, "epoch": 0.04641515105967233, "flos": 23290977853440.0, "grad_norm": 2.206256766876312, "language_loss": 0.83575541, "learning_rate": 3.978878206628377e-06, "loss": 0.85831153, "num_input_tokens_seen": 16588175, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 1.375, "step": 772, "time_per_iteration": 2.4327445030212402 }, { "auxiliary_loss_clip": 0.01177164, "auxiliary_loss_mlp": 0.01059886, "balance_loss_clip": 1.02746129, "balance_loss_mlp": 1.04654682, "epoch": 0.046475274312340296, "flos": 25114142384640.0, "grad_norm": 1.9569844693108625, "language_loss": 0.73629689, "learning_rate": 3.978823406286751e-06, "loss": 0.75866747, "num_input_tokens_seen": 16607735, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.3046875, "step": 773, "time_per_iteration": 2.450957775115967 }, { "auxiliary_loss_clip": 0.0117497, "auxiliary_loss_mlp": 0.01059978, "balance_loss_clip": 1.02726662, "balance_loss_mlp": 1.0445869, "epoch": 0.04653539756500827, "flos": 25263745027200.0, "grad_norm": 2.052677547720233, "language_loss": 0.78662962, "learning_rate": 3.978768535326006e-06, "loss": 0.80897909, "num_input_tokens_seen": 16627225, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.3046875, "step": 774, "time_per_iteration": 2.43341326713562 }, { "auxiliary_loss_clip": 0.01171919, "auxiliary_loss_mlp": 0.01058004, "balance_loss_clip": 1.02596056, "balance_loss_mlp": 1.04186547, "epoch": 0.046595520817676234, "flos": 35402802076800.0, "grad_norm": 2.122867568169163, "language_loss": 0.73343658, "learning_rate": 3.978713593748103e-06, "loss": 0.75573587, "num_input_tokens_seen": 16647785, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.296875, "step": 775, "time_per_iteration": 2.53930926322937 }, { "auxiliary_loss_clip": 0.01176197, "auxiliary_loss_mlp": 0.01062381, "balance_loss_clip": 1.02788162, "balance_loss_mlp": 1.04260957, "epoch": 0.046655644070344206, "flos": 18111577680000.0, "grad_norm": 1.6785940587675907, "language_loss": 0.76859474, "learning_rate": 3.9786585815550015e-06, "loss": 0.79098046, "num_input_tokens_seen": 16667555, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 1.3359375, "step": 776, "time_per_iteration": 2.4250969886779785 }, { "auxiliary_loss_clip": 0.0116975, "auxiliary_loss_mlp": 0.01059828, "balance_loss_clip": 1.02811885, "balance_loss_mlp": 1.04102755, "epoch": 0.04671576732301218, "flos": 29204007759360.0, "grad_norm": 4.3993855916972695, "language_loss": 0.7100842, "learning_rate": 3.978603498748664e-06, "loss": 0.73238003, "num_input_tokens_seen": 16686875, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.28125, "step": 777, "time_per_iteration": 2.4765748977661133 }, { "auxiliary_loss_clip": 0.01172053, "auxiliary_loss_mlp": 0.01070464, "balance_loss_clip": 1.03491557, "balance_loss_mlp": 1.04171491, "epoch": 0.04677589057568014, "flos": 30477115820160.0, "grad_norm": 1.8863215391272792, "language_loss": 0.7640267, "learning_rate": 3.978548345331058e-06, "loss": 0.78645194, "num_input_tokens_seen": 16706420, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 1.296875, "step": 778, "time_per_iteration": 2.472606897354126 }, { "auxiliary_loss_clip": 0.01171305, "auxiliary_loss_mlp": 0.0106156, "balance_loss_clip": 1.02784729, "balance_loss_mlp": 1.04161787, "epoch": 0.046836013828348115, "flos": 20556649992960.0, "grad_norm": 2.3938019710870857, "language_loss": 0.78961205, "learning_rate": 3.978493121304151e-06, "loss": 0.81194067, "num_input_tokens_seen": 16726390, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.296875, "step": 779, "time_per_iteration": 2.429203987121582 }, { "auxiliary_loss_clip": 0.01165012, "auxiliary_loss_mlp": 0.01053025, "balance_loss_clip": 1.0218637, "balance_loss_mlp": 1.03878808, "epoch": 0.04689613708101608, "flos": 25446201125760.0, "grad_norm": 1.7069824197535406, "language_loss": 0.77102339, "learning_rate": 3.978437826669914e-06, "loss": 0.79320371, "num_input_tokens_seen": 16748965, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.265625, "step": 780, "time_per_iteration": 2.483863115310669 }, { "auxiliary_loss_clip": 0.01170145, "auxiliary_loss_mlp": 0.01059947, "balance_loss_clip": 1.02910745, "balance_loss_mlp": 1.04262638, "epoch": 0.04695626033368405, "flos": 23000325851520.0, "grad_norm": 1.9470585388344062, "language_loss": 0.76273519, "learning_rate": 3.9783824614303195e-06, "loss": 0.78503609, "num_input_tokens_seen": 16768620, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.2734375, "step": 781, "time_per_iteration": 2.428377628326416 }, { "auxiliary_loss_clip": 0.01176939, "auxiliary_loss_mlp": 0.01072466, "balance_loss_clip": 1.03796661, "balance_loss_mlp": 1.04257929, "epoch": 0.047016383586352024, "flos": 29132051713920.0, "grad_norm": 2.209148364136706, "language_loss": 0.73881859, "learning_rate": 3.978327025587344e-06, "loss": 0.76131266, "num_input_tokens_seen": 16789755, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.34375, "step": 782, "time_per_iteration": 2.456827402114868 }, { "auxiliary_loss_clip": 0.01171062, "auxiliary_loss_mlp": 0.01053215, "balance_loss_clip": 1.02298379, "balance_loss_mlp": 1.04179227, "epoch": 0.04707650683901999, "flos": 14975434448640.0, "grad_norm": 3.168953013497748, "language_loss": 0.80221462, "learning_rate": 3.978271519142967e-06, "loss": 0.82445741, "num_input_tokens_seen": 16807585, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.296875, "step": 783, "time_per_iteration": 2.392314910888672 }, { "auxiliary_loss_clip": 0.01166711, "auxiliary_loss_mlp": 0.01056846, "balance_loss_clip": 1.02725816, "balance_loss_mlp": 1.04139662, "epoch": 0.04713663009168796, "flos": 21650094864000.0, "grad_norm": 2.5535269215120957, "language_loss": 0.81434727, "learning_rate": 3.978215942099167e-06, "loss": 0.8365829, "num_input_tokens_seen": 16827220, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.25, "step": 784, "time_per_iteration": 2.4085748195648193 }, { "auxiliary_loss_clip": 0.0117326, "auxiliary_loss_mlp": 0.01058399, "balance_loss_clip": 1.02728581, "balance_loss_mlp": 1.0409857, "epoch": 0.04719675334435593, "flos": 21324320167680.0, "grad_norm": 2.743894936329185, "language_loss": 0.80728829, "learning_rate": 3.9781602944579285e-06, "loss": 0.82960492, "num_input_tokens_seen": 16846230, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.3203125, "step": 785, "time_per_iteration": 2.4267308712005615 }, { "auxiliary_loss_clip": 0.01173528, "auxiliary_loss_mlp": 0.01055203, "balance_loss_clip": 1.02485251, "balance_loss_mlp": 1.04502642, "epoch": 0.0472568765970239, "flos": 17930413301760.0, "grad_norm": 1.9076731639279216, "language_loss": 0.89660287, "learning_rate": 3.978104576221238e-06, "loss": 0.91889018, "num_input_tokens_seen": 16865325, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.28125, "step": 786, "time_per_iteration": 2.4019699096679688 }, { "auxiliary_loss_clip": 0.01171585, "auxiliary_loss_mlp": 0.01058545, "balance_loss_clip": 1.02435589, "balance_loss_mlp": 1.03746784, "epoch": 0.04731699984969187, "flos": 18076350251520.0, "grad_norm": 3.8833038203919887, "language_loss": 0.76674724, "learning_rate": 3.978048787391084e-06, "loss": 0.78904855, "num_input_tokens_seen": 16882930, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.34375, "step": 787, "time_per_iteration": 2.373610258102417 }, { "auxiliary_loss_clip": 0.01176536, "auxiliary_loss_mlp": 0.01058107, "balance_loss_clip": 1.02644455, "balance_loss_mlp": 1.04419088, "epoch": 0.047377123102359836, "flos": 23183968936320.0, "grad_norm": 4.053649185534547, "language_loss": 0.80823344, "learning_rate": 3.9779929279694565e-06, "loss": 0.83057988, "num_input_tokens_seen": 16900710, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.328125, "step": 788, "time_per_iteration": 2.4188549518585205 }, { "auxiliary_loss_clip": 0.01171514, "auxiliary_loss_mlp": 0.01060823, "balance_loss_clip": 1.02610922, "balance_loss_mlp": 1.04387689, "epoch": 0.04743724635502781, "flos": 22746681757440.0, "grad_norm": 2.0244616281489547, "language_loss": 0.84739041, "learning_rate": 3.977936997958349e-06, "loss": 0.86971378, "num_input_tokens_seen": 16919210, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.2734375, "step": 789, "time_per_iteration": 2.4152109622955322 }, { "auxiliary_loss_clip": 0.01172058, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.03479958, "balance_loss_mlp": 1.04249954, "epoch": 0.04749736960769577, "flos": 17237736460800.0, "grad_norm": 2.4696918852654024, "language_loss": 0.81907129, "learning_rate": 3.977880997359758e-06, "loss": 0.84143388, "num_input_tokens_seen": 16937125, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.296875, "step": 790, "time_per_iteration": 2.41382098197937 }, { "auxiliary_loss_clip": 0.01168927, "auxiliary_loss_mlp": 0.01055267, "balance_loss_clip": 1.02436805, "balance_loss_mlp": 1.04008615, "epoch": 0.047557492860363745, "flos": 40477672039680.0, "grad_norm": 2.1721899593907517, "language_loss": 0.8778193, "learning_rate": 3.977824926175682e-06, "loss": 0.90006131, "num_input_tokens_seen": 16958610, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.28125, "step": 791, "time_per_iteration": 2.5817065238952637 }, { "auxiliary_loss_clip": 0.01172967, "auxiliary_loss_mlp": 0.01058892, "balance_loss_clip": 1.02744484, "balance_loss_mlp": 1.04207683, "epoch": 0.04761761611303172, "flos": 18697001224320.0, "grad_norm": 2.1744271782528704, "language_loss": 0.90019238, "learning_rate": 3.977768784408122e-06, "loss": 0.92251098, "num_input_tokens_seen": 16977300, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.3046875, "step": 792, "time_per_iteration": 2.4133920669555664 }, { "auxiliary_loss_clip": 0.01170189, "auxiliary_loss_mlp": 0.01068953, "balance_loss_clip": 1.03831649, "balance_loss_mlp": 1.038118, "epoch": 0.04767773936569968, "flos": 20920968696960.0, "grad_norm": 1.9371333153222121, "language_loss": 0.73367131, "learning_rate": 3.977712572059081e-06, "loss": 0.75606275, "num_input_tokens_seen": 16994950, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.3203125, "step": 793, "time_per_iteration": 2.385101795196533 }, { "auxiliary_loss_clip": 0.01173409, "auxiliary_loss_mlp": 0.01053075, "balance_loss_clip": 1.02229476, "balance_loss_mlp": 1.04118943, "epoch": 0.047737862618367655, "flos": 23731546700160.0, "grad_norm": 2.663201040336238, "language_loss": 0.85657656, "learning_rate": 3.977656289130567e-06, "loss": 0.8788414, "num_input_tokens_seen": 17014760, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.328125, "step": 794, "time_per_iteration": 2.423823833465576 }, { "auxiliary_loss_clip": 0.01172468, "auxiliary_loss_mlp": 0.01062428, "balance_loss_clip": 1.03012252, "balance_loss_mlp": 1.0396831, "epoch": 0.04779798587103562, "flos": 23694643526400.0, "grad_norm": 2.6155256508860307, "language_loss": 0.69553244, "learning_rate": 3.977599935624586e-06, "loss": 0.71788138, "num_input_tokens_seen": 17032715, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.328125, "step": 795, "time_per_iteration": 2.4013278484344482 }, { "auxiliary_loss_clip": 0.01169285, "auxiliary_loss_mlp": 0.01065853, "balance_loss_clip": 1.03307092, "balance_loss_mlp": 1.04058623, "epoch": 0.04785810912370359, "flos": 23182572481920.0, "grad_norm": 2.438329569382553, "language_loss": 0.80910087, "learning_rate": 3.977543511543151e-06, "loss": 0.83145225, "num_input_tokens_seen": 17052215, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.2890625, "step": 796, "time_per_iteration": 2.425485610961914 }, { "auxiliary_loss_clip": 0.01169091, "auxiliary_loss_mlp": 0.01057849, "balance_loss_clip": 1.02528143, "balance_loss_mlp": 1.04067898, "epoch": 0.047918232376371564, "flos": 18039656545920.0, "grad_norm": 2.4290765407806587, "language_loss": 0.81627935, "learning_rate": 3.977487016888274e-06, "loss": 0.83854878, "num_input_tokens_seen": 17069225, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.28125, "step": 797, "time_per_iteration": 2.385831117630005 }, { "auxiliary_loss_clip": 0.01053266, "auxiliary_loss_mlp": 0.01004369, "balance_loss_clip": 0.99902827, "balance_loss_mlp": 1.01167202, "epoch": 0.04797835562903953, "flos": 62439400632960.0, "grad_norm": 0.9135418856904373, "language_loss": 0.64484239, "learning_rate": 3.977430451661972e-06, "loss": 0.66541874, "num_input_tokens_seen": 17126680, "router_z_loss_clip": 0.0534668, "router_z_loss_mlp": 0.41601562, "step": 798, "time_per_iteration": 2.9660699367523193 }, { "auxiliary_loss_clip": 0.01174345, "auxiliary_loss_mlp": 0.01058284, "balance_loss_clip": 1.02762318, "balance_loss_mlp": 1.03999674, "epoch": 0.0480384788817075, "flos": 21506217684480.0, "grad_norm": 1.9946740548056843, "language_loss": 0.90883076, "learning_rate": 3.9773738158662655e-06, "loss": 0.93115699, "num_input_tokens_seen": 17144835, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.34375, "step": 799, "time_per_iteration": 2.409356117248535 }, { "auxiliary_loss_clip": 0.01171554, "auxiliary_loss_mlp": 0.01055537, "balance_loss_clip": 1.02487683, "balance_loss_mlp": 1.04478359, "epoch": 0.048098602134375466, "flos": 21725611868160.0, "grad_norm": 2.0226049753764235, "language_loss": 0.86634338, "learning_rate": 3.977317109503172e-06, "loss": 0.8886143, "num_input_tokens_seen": 17165030, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.265625, "step": 800, "time_per_iteration": 2.443582773208618 }, { "auxiliary_loss_clip": 0.01173783, "auxiliary_loss_mlp": 0.01063135, "balance_loss_clip": 1.03245056, "balance_loss_mlp": 1.04335332, "epoch": 0.04815872538704344, "flos": 22929940817280.0, "grad_norm": 3.5504707427095092, "language_loss": 0.83880752, "learning_rate": 3.977260332574718e-06, "loss": 0.86117673, "num_input_tokens_seen": 17184895, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.3046875, "step": 801, "time_per_iteration": 2.430955410003662 }, { "auxiliary_loss_clip": 0.01170878, "auxiliary_loss_mlp": 0.01060558, "balance_loss_clip": 1.02977824, "balance_loss_mlp": 1.04109502, "epoch": 0.04821884863971141, "flos": 43173176601600.0, "grad_norm": 2.5459362418907205, "language_loss": 0.79219079, "learning_rate": 3.977203485082928e-06, "loss": 0.81450516, "num_input_tokens_seen": 17208225, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.296875, "step": 802, "time_per_iteration": 2.6055495738983154 }, { "auxiliary_loss_clip": 0.01170224, "auxiliary_loss_mlp": 0.01054641, "balance_loss_clip": 1.02426648, "balance_loss_mlp": 1.04124594, "epoch": 0.048278971892379376, "flos": 18619145159040.0, "grad_norm": 1.745606544997716, "language_loss": 0.86103964, "learning_rate": 3.977146567029833e-06, "loss": 0.88328832, "num_input_tokens_seen": 17226305, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.2890625, "step": 803, "time_per_iteration": 3.791219711303711 }, { "auxiliary_loss_clip": 0.0116548, "auxiliary_loss_mlp": 0.0105238, "balance_loss_clip": 1.02250648, "balance_loss_mlp": 1.04170287, "epoch": 0.04833909514504735, "flos": 20229024994560.0, "grad_norm": 2.2811621272757576, "language_loss": 0.85222125, "learning_rate": 3.977089578417462e-06, "loss": 0.87439978, "num_input_tokens_seen": 17244545, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.234375, "step": 804, "time_per_iteration": 2.4115700721740723 }, { "auxiliary_loss_clip": 0.0116977, "auxiliary_loss_mlp": 0.01048593, "balance_loss_clip": 1.01893377, "balance_loss_mlp": 1.04267776, "epoch": 0.04839921839771532, "flos": 24644001749760.0, "grad_norm": 2.490447923729626, "language_loss": 0.86260319, "learning_rate": 3.9770325192478504e-06, "loss": 0.88478678, "num_input_tokens_seen": 17265730, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.265625, "step": 805, "time_per_iteration": 2.454223155975342 }, { "auxiliary_loss_clip": 0.01163325, "auxiliary_loss_mlp": 0.01052235, "balance_loss_clip": 1.02326751, "balance_loss_mlp": 1.03973639, "epoch": 0.048459341650383285, "flos": 24826283291520.0, "grad_norm": 2.60907289230247, "language_loss": 0.67868835, "learning_rate": 3.9769753895230324e-06, "loss": 0.70084393, "num_input_tokens_seen": 17284820, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.234375, "step": 806, "time_per_iteration": 5.261777639389038 }, { "auxiliary_loss_clip": 0.01165136, "auxiliary_loss_mlp": 0.0105429, "balance_loss_clip": 1.02570355, "balance_loss_mlp": 1.04075348, "epoch": 0.04851946490305126, "flos": 22162130997120.0, "grad_norm": 5.154972087774901, "language_loss": 0.7642802, "learning_rate": 3.976918189245049e-06, "loss": 0.78647453, "num_input_tokens_seen": 17305085, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.2421875, "step": 807, "time_per_iteration": 2.4211854934692383 }, { "auxiliary_loss_clip": 0.01164869, "auxiliary_loss_mlp": 0.0106269, "balance_loss_clip": 1.0339365, "balance_loss_mlp": 1.03880262, "epoch": 0.04857958815571922, "flos": 19791004677120.0, "grad_norm": 2.46456303042586, "language_loss": 0.86459714, "learning_rate": 3.9768609184159405e-06, "loss": 0.88687277, "num_input_tokens_seen": 17322715, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.265625, "step": 808, "time_per_iteration": 3.7626330852508545 }, { "auxiliary_loss_clip": 0.01170461, "auxiliary_loss_mlp": 0.01053986, "balance_loss_clip": 1.02507806, "balance_loss_mlp": 1.04022026, "epoch": 0.048639711408387194, "flos": 18696966312960.0, "grad_norm": 2.3646723308193276, "language_loss": 0.89717674, "learning_rate": 3.976803577037751e-06, "loss": 0.91942126, "num_input_tokens_seen": 17341455, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.296875, "step": 809, "time_per_iteration": 2.4033820629119873 }, { "auxiliary_loss_clip": 0.01170753, "auxiliary_loss_mlp": 0.01060366, "balance_loss_clip": 1.02941978, "balance_loss_mlp": 1.04319715, "epoch": 0.048699834661055166, "flos": 24862348592640.0, "grad_norm": 1.9856387765986683, "language_loss": 0.84460419, "learning_rate": 3.976746165112527e-06, "loss": 0.8669154, "num_input_tokens_seen": 17360765, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.28125, "step": 810, "time_per_iteration": 2.501471757888794 }, { "auxiliary_loss_clip": 0.01171227, "auxiliary_loss_mlp": 0.01050112, "balance_loss_clip": 1.02076244, "balance_loss_mlp": 1.04236078, "epoch": 0.04875995791372313, "flos": 20702970967680.0, "grad_norm": 5.847958811419739, "language_loss": 0.80468845, "learning_rate": 3.976688682642317e-06, "loss": 0.82690179, "num_input_tokens_seen": 17380625, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.2890625, "step": 811, "time_per_iteration": 2.4055933952331543 }, { "auxiliary_loss_clip": 0.01161484, "auxiliary_loss_mlp": 0.01058142, "balance_loss_clip": 1.02891231, "balance_loss_mlp": 1.03926706, "epoch": 0.048820081166391104, "flos": 18587304132480.0, "grad_norm": 1.782692438004299, "language_loss": 0.74147636, "learning_rate": 3.976631129629173e-06, "loss": 0.76367265, "num_input_tokens_seen": 17399355, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.21875, "step": 812, "time_per_iteration": 2.411888360977173 }, { "auxiliary_loss_clip": 0.01164119, "auxiliary_loss_mlp": 0.01059981, "balance_loss_clip": 1.03265858, "balance_loss_mlp": 1.04155898, "epoch": 0.04888020441905907, "flos": 22706322359040.0, "grad_norm": 1.956145964727686, "language_loss": 0.89826584, "learning_rate": 3.9765735060751475e-06, "loss": 0.92050683, "num_input_tokens_seen": 17418240, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.2265625, "step": 813, "time_per_iteration": 2.4079573154449463 }, { "auxiliary_loss_clip": 0.011635, "auxiliary_loss_mlp": 0.01050011, "balance_loss_clip": 1.02287924, "balance_loss_mlp": 1.04093742, "epoch": 0.04894032767172704, "flos": 22783235817600.0, "grad_norm": 2.4332683364797165, "language_loss": 0.74885005, "learning_rate": 3.976515811982298e-06, "loss": 0.77098519, "num_input_tokens_seen": 17436250, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.2265625, "step": 814, "time_per_iteration": 2.4603400230407715 }, { "auxiliary_loss_clip": 0.0116791, "auxiliary_loss_mlp": 0.01061613, "balance_loss_clip": 1.03182244, "balance_loss_mlp": 1.04210234, "epoch": 0.04900045092439501, "flos": 25515084971520.0, "grad_norm": 2.394562762664596, "language_loss": 0.83616436, "learning_rate": 3.976458047352684e-06, "loss": 0.85845953, "num_input_tokens_seen": 17455750, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.2578125, "step": 815, "time_per_iteration": 2.4485135078430176 }, { "auxiliary_loss_clip": 0.01166777, "auxiliary_loss_mlp": 0.01055653, "balance_loss_clip": 1.02430129, "balance_loss_mlp": 1.03958774, "epoch": 0.04906057417706298, "flos": 25956945538560.0, "grad_norm": 2.157888550916716, "language_loss": 0.90636873, "learning_rate": 3.976400212188366e-06, "loss": 0.92859304, "num_input_tokens_seen": 17474995, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.2734375, "step": 816, "time_per_iteration": 2.444671630859375 }, { "auxiliary_loss_clip": 0.01168071, "auxiliary_loss_mlp": 0.01055868, "balance_loss_clip": 1.02784157, "balance_loss_mlp": 1.04176772, "epoch": 0.04912069742973095, "flos": 18623648724480.0, "grad_norm": 2.630038287340091, "language_loss": 0.79744601, "learning_rate": 3.976342306491408e-06, "loss": 0.81968546, "num_input_tokens_seen": 17493395, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.265625, "step": 817, "time_per_iteration": 2.4015910625457764 }, { "auxiliary_loss_clip": 0.01165215, "auxiliary_loss_mlp": 0.01062397, "balance_loss_clip": 1.03328562, "balance_loss_mlp": 1.04173994, "epoch": 0.049180820682398915, "flos": 23698553598720.0, "grad_norm": 2.698144736986534, "language_loss": 0.84772664, "learning_rate": 3.976284330263878e-06, "loss": 0.87000275, "num_input_tokens_seen": 17514565, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.234375, "step": 818, "time_per_iteration": 2.4503378868103027 }, { "auxiliary_loss_clip": 0.01169224, "auxiliary_loss_mlp": 0.01056172, "balance_loss_clip": 1.0257256, "balance_loss_mlp": 1.04177284, "epoch": 0.04924094393506689, "flos": 22419266227200.0, "grad_norm": 3.614464798815647, "language_loss": 0.7506969, "learning_rate": 3.976226283507843e-06, "loss": 0.77295083, "num_input_tokens_seen": 17534590, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.2734375, "step": 819, "time_per_iteration": 2.423330783843994 }, { "auxiliary_loss_clip": 0.01167272, "auxiliary_loss_mlp": 0.01055403, "balance_loss_clip": 1.02767491, "balance_loss_mlp": 1.04262042, "epoch": 0.04930106718773486, "flos": 15737448983040.0, "grad_norm": 2.234062200713571, "language_loss": 0.85044587, "learning_rate": 3.976168166225375e-06, "loss": 0.87267256, "num_input_tokens_seen": 17551900, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.25, "step": 820, "time_per_iteration": 2.420631170272827 }, { "auxiliary_loss_clip": 0.01168213, "auxiliary_loss_mlp": 0.01054757, "balance_loss_clip": 1.02502584, "balance_loss_mlp": 1.04121029, "epoch": 0.049361190440402825, "flos": 26249412931200.0, "grad_norm": 2.000199948064709, "language_loss": 0.90914762, "learning_rate": 3.976109978418549e-06, "loss": 0.93137735, "num_input_tokens_seen": 17571485, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.265625, "step": 821, "time_per_iteration": 2.4498400688171387 }, { "auxiliary_loss_clip": 0.01167231, "auxiliary_loss_mlp": 0.01064154, "balance_loss_clip": 1.03509116, "balance_loss_mlp": 1.0418961, "epoch": 0.0494213136930708, "flos": 21251281870080.0, "grad_norm": 1.832087213668366, "language_loss": 0.8943603, "learning_rate": 3.976051720089441e-06, "loss": 0.91667426, "num_input_tokens_seen": 17591410, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.25, "step": 822, "time_per_iteration": 2.4395205974578857 }, { "auxiliary_loss_clip": 0.01166594, "auxiliary_loss_mlp": 0.01057127, "balance_loss_clip": 1.02496481, "balance_loss_mlp": 1.0421176, "epoch": 0.04948143694573876, "flos": 27964241913600.0, "grad_norm": 6.6433608209893436, "language_loss": 0.67021036, "learning_rate": 3.9759933912401304e-06, "loss": 0.69244754, "num_input_tokens_seen": 17612010, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.25, "step": 823, "time_per_iteration": 2.4612014293670654 }, { "auxiliary_loss_clip": 0.01050969, "auxiliary_loss_mlp": 0.01017236, "balance_loss_clip": 1.01137078, "balance_loss_mlp": 1.00644159, "epoch": 0.049541560198406734, "flos": 66178250398080.0, "grad_norm": 1.3172251732625322, "language_loss": 0.62187296, "learning_rate": 3.975934991872698e-06, "loss": 0.642555, "num_input_tokens_seen": 17673430, "router_z_loss_clip": 0.05859375, "router_z_loss_mlp": 0.4453125, "step": 824, "time_per_iteration": 3.1524150371551514 }, { "auxiliary_loss_clip": 0.0116972, "auxiliary_loss_mlp": 0.01061632, "balance_loss_clip": 1.02976692, "balance_loss_mlp": 1.04172587, "epoch": 0.049601683451074706, "flos": 22891606277760.0, "grad_norm": 1.8254047317461848, "language_loss": 0.90296292, "learning_rate": 3.975876521989229e-06, "loss": 0.9252764, "num_input_tokens_seen": 17689545, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.28125, "step": 825, "time_per_iteration": 2.4061501026153564 }, { "auxiliary_loss_clip": 0.01170339, "auxiliary_loss_mlp": 0.0106166, "balance_loss_clip": 1.02899635, "balance_loss_mlp": 1.04297948, "epoch": 0.04966180670374267, "flos": 21432585893760.0, "grad_norm": 2.234887316682884, "language_loss": 0.66441983, "learning_rate": 3.975817981591809e-06, "loss": 0.6867398, "num_input_tokens_seen": 17705965, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.2734375, "step": 826, "time_per_iteration": 2.4316608905792236 }, { "auxiliary_loss_clip": 0.01170613, "auxiliary_loss_mlp": 0.01059237, "balance_loss_clip": 1.02803993, "balance_loss_mlp": 1.04257441, "epoch": 0.04972192995641064, "flos": 23106392161920.0, "grad_norm": 2.0648308008719636, "language_loss": 0.78250402, "learning_rate": 3.975759370682528e-06, "loss": 0.80480254, "num_input_tokens_seen": 17724580, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.28125, "step": 827, "time_per_iteration": 2.4437127113342285 }, { "auxiliary_loss_clip": 0.01172813, "auxiliary_loss_mlp": 0.01065139, "balance_loss_clip": 1.03342938, "balance_loss_mlp": 1.04325986, "epoch": 0.04978205320907861, "flos": 40404563919360.0, "grad_norm": 1.6663251374112558, "language_loss": 0.78703785, "learning_rate": 3.975700689263477e-06, "loss": 0.80941737, "num_input_tokens_seen": 17747755, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.296875, "step": 828, "time_per_iteration": 2.6153178215026855 }, { "auxiliary_loss_clip": 0.01162054, "auxiliary_loss_mlp": 0.01054695, "balance_loss_clip": 1.02622795, "balance_loss_mlp": 1.0404613, "epoch": 0.04984217646174658, "flos": 25227365523840.0, "grad_norm": 2.043692486008699, "language_loss": 0.83223975, "learning_rate": 3.97564193733675e-06, "loss": 0.85440719, "num_input_tokens_seen": 17768550, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.21875, "step": 829, "time_per_iteration": 2.4516046047210693 }, { "auxiliary_loss_clip": 0.01167924, "auxiliary_loss_mlp": 0.01064178, "balance_loss_clip": 1.0296793, "balance_loss_mlp": 1.03958869, "epoch": 0.04990229971441455, "flos": 15958763291520.0, "grad_norm": 1.9552786395143507, "language_loss": 0.75125033, "learning_rate": 3.975583114904446e-06, "loss": 0.77357137, "num_input_tokens_seen": 17786080, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 1.28125, "step": 830, "time_per_iteration": 2.410106897354126 }, { "auxiliary_loss_clip": 0.01168344, "auxiliary_loss_mlp": 0.01063958, "balance_loss_clip": 1.03321362, "balance_loss_mlp": 1.039922, "epoch": 0.04996242296708252, "flos": 18404149806720.0, "grad_norm": 1.9775454545213287, "language_loss": 0.79518765, "learning_rate": 3.975524221968661e-06, "loss": 0.81751066, "num_input_tokens_seen": 17803635, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.28125, "step": 831, "time_per_iteration": 2.4046993255615234 }, { "auxiliary_loss_clip": 0.01170035, "auxiliary_loss_mlp": 0.01065907, "balance_loss_clip": 1.03642654, "balance_loss_mlp": 1.04187751, "epoch": 0.05002254621975049, "flos": 17857095713280.0, "grad_norm": 2.480317958948075, "language_loss": 0.91368961, "learning_rate": 3.975465258531499e-06, "loss": 0.93604904, "num_input_tokens_seen": 17822190, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.28125, "step": 832, "time_per_iteration": 2.4149022102355957 }, { "auxiliary_loss_clip": 0.01162742, "auxiliary_loss_mlp": 0.01063583, "balance_loss_clip": 1.03423357, "balance_loss_mlp": 1.04163289, "epoch": 0.050082669472418455, "flos": 45658538490240.0, "grad_norm": 2.126591231557392, "language_loss": 0.83265626, "learning_rate": 3.9754062245950625e-06, "loss": 0.85491955, "num_input_tokens_seen": 17846915, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.2109375, "step": 833, "time_per_iteration": 2.5969738960266113 }, { "auxiliary_loss_clip": 0.01164549, "auxiliary_loss_mlp": 0.01053187, "balance_loss_clip": 1.02181077, "balance_loss_mlp": 1.0378592, "epoch": 0.05014279272508643, "flos": 37960538947200.0, "grad_norm": 2.5697450136119495, "language_loss": 0.82563829, "learning_rate": 3.975347120161459e-06, "loss": 0.84781563, "num_input_tokens_seen": 17867270, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.265625, "step": 834, "time_per_iteration": 2.5611331462860107 }, { "auxiliary_loss_clip": 0.0116794, "auxiliary_loss_mlp": 0.01056384, "balance_loss_clip": 1.02417338, "balance_loss_mlp": 1.04004765, "epoch": 0.0502029159777544, "flos": 20995124158080.0, "grad_norm": 2.2147712592351776, "language_loss": 0.91696298, "learning_rate": 3.975287945232799e-06, "loss": 0.93920618, "num_input_tokens_seen": 17884880, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.28125, "step": 835, "time_per_iteration": 2.3931682109832764 }, { "auxiliary_loss_clip": 0.01169635, "auxiliary_loss_mlp": 0.0106848, "balance_loss_clip": 1.03572142, "balance_loss_mlp": 1.03864908, "epoch": 0.050263039230422364, "flos": 15887156359680.0, "grad_norm": 8.942841368746471, "language_loss": 0.76724601, "learning_rate": 3.975228699811193e-06, "loss": 0.78962719, "num_input_tokens_seen": 17903695, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.3125, "step": 836, "time_per_iteration": 2.4156110286712646 }, { "auxiliary_loss_clip": 0.01163802, "auxiliary_loss_mlp": 0.01066891, "balance_loss_clip": 1.03853154, "balance_loss_mlp": 1.04223275, "epoch": 0.050323162483090336, "flos": 23731616522880.0, "grad_norm": 2.121859205210282, "language_loss": 0.83580768, "learning_rate": 3.975169383898755e-06, "loss": 0.8581146, "num_input_tokens_seen": 17920745, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.21875, "step": 837, "time_per_iteration": 2.415476083755493 }, { "auxiliary_loss_clip": 0.01164628, "auxiliary_loss_mlp": 0.01063927, "balance_loss_clip": 1.03463697, "balance_loss_mlp": 1.0413357, "epoch": 0.0503832857357583, "flos": 20265195029760.0, "grad_norm": 2.4680140494286156, "language_loss": 0.7328164, "learning_rate": 3.975109997497604e-06, "loss": 0.75510192, "num_input_tokens_seen": 17938220, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.234375, "step": 838, "time_per_iteration": 2.435312271118164 }, { "auxiliary_loss_clip": 0.01160911, "auxiliary_loss_mlp": 0.01063132, "balance_loss_clip": 1.03287721, "balance_loss_mlp": 1.03826404, "epoch": 0.05044340898842627, "flos": 17784057415680.0, "grad_norm": 2.914937148434028, "language_loss": 0.83091825, "learning_rate": 3.975050540609857e-06, "loss": 0.85315871, "num_input_tokens_seen": 17957325, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.2265625, "step": 839, "time_per_iteration": 2.4032273292541504 }, { "auxiliary_loss_clip": 0.01159855, "auxiliary_loss_mlp": 0.01056911, "balance_loss_clip": 1.02784753, "balance_loss_mlp": 1.04024374, "epoch": 0.050503532241094246, "flos": 22965412625280.0, "grad_norm": 1.7597979747182033, "language_loss": 0.8568148, "learning_rate": 3.9749910132376355e-06, "loss": 0.87898248, "num_input_tokens_seen": 17975875, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.203125, "step": 840, "time_per_iteration": 2.453213691711426 }, { "auxiliary_loss_clip": 0.0116538, "auxiliary_loss_mlp": 0.01057513, "balance_loss_clip": 1.02687609, "balance_loss_mlp": 1.04037189, "epoch": 0.05056365549376221, "flos": 22776078988800.0, "grad_norm": 1.9817718431560314, "language_loss": 0.9464941, "learning_rate": 3.974931415383066e-06, "loss": 0.96872306, "num_input_tokens_seen": 17994340, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.25, "step": 841, "time_per_iteration": 2.447728157043457 }, { "auxiliary_loss_clip": 0.01166064, "auxiliary_loss_mlp": 0.01059588, "balance_loss_clip": 1.03023815, "balance_loss_mlp": 1.03955817, "epoch": 0.05062377874643018, "flos": 30915729630720.0, "grad_norm": 2.1937082555241596, "language_loss": 0.77494878, "learning_rate": 3.974871747048274e-06, "loss": 0.79720527, "num_input_tokens_seen": 18015260, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.265625, "step": 842, "time_per_iteration": 3.926527976989746 }, { "auxiliary_loss_clip": 0.01172329, "auxiliary_loss_mlp": 0.01070474, "balance_loss_clip": 1.03757167, "balance_loss_mlp": 1.04413319, "epoch": 0.05068390199909815, "flos": 19646115068160.0, "grad_norm": 2.3304262601049843, "language_loss": 0.78067744, "learning_rate": 3.97481200823539e-06, "loss": 0.80310547, "num_input_tokens_seen": 18033960, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.28125, "step": 843, "time_per_iteration": 2.4005467891693115 }, { "auxiliary_loss_clip": 0.01167731, "auxiliary_loss_mlp": 0.01048925, "balance_loss_clip": 1.01936114, "balance_loss_mlp": 1.04121172, "epoch": 0.05074402525176612, "flos": 37960573858560.0, "grad_norm": 2.398313254478142, "language_loss": 0.83207279, "learning_rate": 3.974752198946545e-06, "loss": 0.85423934, "num_input_tokens_seen": 18056700, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.265625, "step": 844, "time_per_iteration": 2.590186595916748 }, { "auxiliary_loss_clip": 0.01160503, "auxiliary_loss_mlp": 0.01058272, "balance_loss_clip": 1.02806437, "balance_loss_mlp": 1.03811467, "epoch": 0.05080414850443409, "flos": 22053516157440.0, "grad_norm": 2.354765899656259, "language_loss": 0.76544082, "learning_rate": 3.974692319183873e-06, "loss": 0.78762859, "num_input_tokens_seen": 18075815, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.2265625, "step": 845, "time_per_iteration": 2.4088919162750244 }, { "auxiliary_loss_clip": 0.01161682, "auxiliary_loss_mlp": 0.01059081, "balance_loss_clip": 1.02811074, "balance_loss_mlp": 1.03650367, "epoch": 0.05086427175710206, "flos": 20224870542720.0, "grad_norm": 1.7049887007445408, "language_loss": 0.87393314, "learning_rate": 3.974632368949513e-06, "loss": 0.89614075, "num_input_tokens_seen": 18095095, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.25, "step": 846, "time_per_iteration": 3.855921506881714 }, { "auxiliary_loss_clip": 0.01166303, "auxiliary_loss_mlp": 0.01054591, "balance_loss_clip": 1.02493167, "balance_loss_mlp": 1.04270983, "epoch": 0.05092439500977003, "flos": 15158309483520.0, "grad_norm": 2.118490048729354, "language_loss": 0.87353724, "learning_rate": 3.974572348245602e-06, "loss": 0.89574617, "num_input_tokens_seen": 18112675, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.234375, "step": 847, "time_per_iteration": 2.3899219036102295 }, { "auxiliary_loss_clip": 0.01158489, "auxiliary_loss_mlp": 0.01055071, "balance_loss_clip": 1.0248158, "balance_loss_mlp": 1.03820229, "epoch": 0.050984518262437994, "flos": 22054039827840.0, "grad_norm": 2.2959085046618943, "language_loss": 0.81882077, "learning_rate": 3.974512257074284e-06, "loss": 0.84095639, "num_input_tokens_seen": 18130745, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.203125, "step": 848, "time_per_iteration": 3.773952007293701 }, { "auxiliary_loss_clip": 0.01164005, "auxiliary_loss_mlp": 0.01058961, "balance_loss_clip": 1.02872968, "balance_loss_mlp": 1.04196203, "epoch": 0.05104464151510597, "flos": 30224065219200.0, "grad_norm": 2.2019890987313504, "language_loss": 0.87174815, "learning_rate": 3.974452095437701e-06, "loss": 0.89397776, "num_input_tokens_seen": 18152410, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.21875, "step": 849, "time_per_iteration": 2.490201711654663 }, { "auxiliary_loss_clip": 0.01159582, "auxiliary_loss_mlp": 0.01052927, "balance_loss_clip": 1.02360129, "balance_loss_mlp": 1.03816199, "epoch": 0.05110476476777394, "flos": 18331914470400.0, "grad_norm": 2.0239864852213465, "language_loss": 0.83400553, "learning_rate": 3.974391863338003e-06, "loss": 0.8561306, "num_input_tokens_seen": 18170870, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.21875, "step": 850, "time_per_iteration": 2.468676805496216 }, { "auxiliary_loss_clip": 0.01160646, "auxiliary_loss_mlp": 0.01051729, "balance_loss_clip": 1.02292764, "balance_loss_mlp": 1.03897095, "epoch": 0.051164888020441904, "flos": 37997197741440.0, "grad_norm": 1.9923005803260347, "language_loss": 0.65049136, "learning_rate": 3.974331560777338e-06, "loss": 0.67261505, "num_input_tokens_seen": 18191555, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.21875, "step": 851, "time_per_iteration": 2.573582172393799 }, { "auxiliary_loss_clip": 0.01158165, "auxiliary_loss_mlp": 0.0105297, "balance_loss_clip": 1.02218974, "balance_loss_mlp": 1.03646827, "epoch": 0.051225011273109876, "flos": 23037543227520.0, "grad_norm": 2.750400779171418, "language_loss": 0.83152038, "learning_rate": 3.974271187757857e-06, "loss": 0.85363173, "num_input_tokens_seen": 18208620, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.21875, "step": 852, "time_per_iteration": 2.4141244888305664 }, { "auxiliary_loss_clip": 0.01166429, "auxiliary_loss_mlp": 0.0106459, "balance_loss_clip": 1.03333354, "balance_loss_mlp": 1.04123831, "epoch": 0.05128513452577785, "flos": 18258841261440.0, "grad_norm": 2.0184171365603514, "language_loss": 0.80007803, "learning_rate": 3.974210744281717e-06, "loss": 0.82238829, "num_input_tokens_seen": 18226370, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.25, "step": 853, "time_per_iteration": 2.405571937561035 }, { "auxiliary_loss_clip": 0.01160161, "auxiliary_loss_mlp": 0.01055026, "balance_loss_clip": 1.02531874, "balance_loss_mlp": 1.03994238, "epoch": 0.05134525777844581, "flos": 27197723813760.0, "grad_norm": 1.9780043386979285, "language_loss": 0.75332499, "learning_rate": 3.974150230351074e-06, "loss": 0.77547681, "num_input_tokens_seen": 18247075, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.203125, "step": 854, "time_per_iteration": 2.4725728034973145 }, { "auxiliary_loss_clip": 0.01164939, "auxiliary_loss_mlp": 0.01053258, "balance_loss_clip": 1.02355087, "balance_loss_mlp": 1.04014957, "epoch": 0.051405381031113785, "flos": 28361099871360.0, "grad_norm": 2.159727043901847, "language_loss": 0.81719911, "learning_rate": 3.974089645968087e-06, "loss": 0.83938104, "num_input_tokens_seen": 18265680, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.25, "step": 855, "time_per_iteration": 2.4578967094421387 }, { "auxiliary_loss_clip": 0.01051068, "auxiliary_loss_mlp": 0.01009121, "balance_loss_clip": 1.00416172, "balance_loss_mlp": 1.01090991, "epoch": 0.05146550428378175, "flos": 65614855921920.0, "grad_norm": 0.9758737055950394, "language_loss": 0.65607464, "learning_rate": 3.974028991134917e-06, "loss": 0.67667657, "num_input_tokens_seen": 18327015, "router_z_loss_clip": 0.04956055, "router_z_loss_mlp": 0.40234375, "step": 856, "time_per_iteration": 3.051262855529785 }, { "auxiliary_loss_clip": 0.01158072, "auxiliary_loss_mlp": 0.01047891, "balance_loss_clip": 1.01911426, "balance_loss_mlp": 1.03875589, "epoch": 0.05152562753644972, "flos": 22053760536960.0, "grad_norm": 3.191385805610801, "language_loss": 0.76746464, "learning_rate": 3.973968265853732e-06, "loss": 0.7895242, "num_input_tokens_seen": 18345235, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.1953125, "step": 857, "time_per_iteration": 2.4042773246765137 }, { "auxiliary_loss_clip": 0.01162847, "auxiliary_loss_mlp": 0.01053205, "balance_loss_clip": 1.02392697, "balance_loss_mlp": 1.03982496, "epoch": 0.051585750789117694, "flos": 18508714928640.0, "grad_norm": 2.3876640997155048, "language_loss": 0.88652521, "learning_rate": 3.973907470126697e-06, "loss": 0.9086858, "num_input_tokens_seen": 18362350, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.234375, "step": 858, "time_per_iteration": 2.413517475128174 }, { "auxiliary_loss_clip": 0.01161464, "auxiliary_loss_mlp": 0.01053192, "balance_loss_clip": 1.02322316, "balance_loss_mlp": 1.03962851, "epoch": 0.05164587404178566, "flos": 23729172727680.0, "grad_norm": 2.7678383094014634, "language_loss": 0.75064158, "learning_rate": 3.973846603955982e-06, "loss": 0.77278817, "num_input_tokens_seen": 18383390, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.21875, "step": 859, "time_per_iteration": 2.435861825942993 }, { "auxiliary_loss_clip": 0.01168612, "auxiliary_loss_mlp": 0.0106378, "balance_loss_clip": 1.0315932, "balance_loss_mlp": 1.04018474, "epoch": 0.05170599729445363, "flos": 16251963822720.0, "grad_norm": 2.468489423187343, "language_loss": 0.90703034, "learning_rate": 3.973785667343758e-06, "loss": 0.92935425, "num_input_tokens_seen": 18399220, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.28125, "step": 860, "time_per_iteration": 2.389540672302246 }, { "auxiliary_loss_clip": 0.01163009, "auxiliary_loss_mlp": 0.01049808, "balance_loss_clip": 1.02217507, "balance_loss_mlp": 1.04159057, "epoch": 0.0517661205471216, "flos": 23984841680640.0, "grad_norm": 1.99597550647492, "language_loss": 0.82325977, "learning_rate": 3.973724660292202e-06, "loss": 0.84538794, "num_input_tokens_seen": 18419005, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.21875, "step": 861, "time_per_iteration": 2.447273015975952 }, { "auxiliary_loss_clip": 0.01163287, "auxiliary_loss_mlp": 0.01049733, "balance_loss_clip": 1.02100301, "balance_loss_mlp": 1.04005361, "epoch": 0.05182624379978957, "flos": 29276452563840.0, "grad_norm": 2.184600867013007, "language_loss": 0.78252262, "learning_rate": 3.973663582803489e-06, "loss": 0.80465281, "num_input_tokens_seen": 18440550, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.234375, "step": 862, "time_per_iteration": 2.4645631313323975 }, { "auxiliary_loss_clip": 0.01160461, "auxiliary_loss_mlp": 0.01060547, "balance_loss_clip": 1.03124499, "balance_loss_mlp": 1.04275537, "epoch": 0.05188636705245754, "flos": 24169671751680.0, "grad_norm": 1.8707139286249292, "language_loss": 0.89435291, "learning_rate": 3.9736024348798e-06, "loss": 0.91656297, "num_input_tokens_seen": 18461950, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.171875, "step": 863, "time_per_iteration": 2.467291831970215 }, { "auxiliary_loss_clip": 0.01164169, "auxiliary_loss_mlp": 0.01061407, "balance_loss_clip": 1.03003168, "balance_loss_mlp": 1.04240823, "epoch": 0.051946490305125506, "flos": 26759494028160.0, "grad_norm": 2.655600598569303, "language_loss": 0.75558275, "learning_rate": 3.973541216523316e-06, "loss": 0.77783847, "num_input_tokens_seen": 18480555, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.21875, "step": 864, "time_per_iteration": 2.4739580154418945 }, { "auxiliary_loss_clip": 0.01165025, "auxiliary_loss_mlp": 0.01055548, "balance_loss_clip": 1.02510238, "balance_loss_mlp": 1.04179323, "epoch": 0.05200661355779348, "flos": 21501574473600.0, "grad_norm": 1.9851203173179528, "language_loss": 0.78729963, "learning_rate": 3.973479927736224e-06, "loss": 0.80950534, "num_input_tokens_seen": 18499645, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.234375, "step": 865, "time_per_iteration": 2.481898546218872 }, { "auxiliary_loss_clip": 0.0116096, "auxiliary_loss_mlp": 0.01056058, "balance_loss_clip": 1.02567112, "balance_loss_mlp": 1.03859043, "epoch": 0.05206673681046144, "flos": 18113497804800.0, "grad_norm": 2.155310896039154, "language_loss": 0.85959566, "learning_rate": 3.973418568520709e-06, "loss": 0.88176584, "num_input_tokens_seen": 18516810, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.21875, "step": 866, "time_per_iteration": 2.3983001708984375 }, { "auxiliary_loss_clip": 0.01165422, "auxiliary_loss_mlp": 0.01058009, "balance_loss_clip": 1.02902925, "balance_loss_mlp": 1.04385662, "epoch": 0.052126860063129415, "flos": 17523396138240.0, "grad_norm": 2.813766549619152, "language_loss": 0.87160748, "learning_rate": 3.973357138878961e-06, "loss": 0.89384174, "num_input_tokens_seen": 18532510, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.21875, "step": 867, "time_per_iteration": 2.3934857845306396 }, { "auxiliary_loss_clip": 0.01154742, "auxiliary_loss_mlp": 0.01062666, "balance_loss_clip": 1.03545022, "balance_loss_mlp": 1.0386498, "epoch": 0.05218698331579739, "flos": 32596692727680.0, "grad_norm": 1.4836901961628903, "language_loss": 0.6341002, "learning_rate": 3.973295638813174e-06, "loss": 0.65627426, "num_input_tokens_seen": 18557380, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.15625, "step": 868, "time_per_iteration": 2.5175845623016357 }, { "auxiliary_loss_clip": 0.01165022, "auxiliary_loss_mlp": 0.01062688, "balance_loss_clip": 1.03052521, "balance_loss_mlp": 1.04034626, "epoch": 0.05224710656846535, "flos": 22126205341440.0, "grad_norm": 4.559647925660619, "language_loss": 0.83260775, "learning_rate": 3.973234068325541e-06, "loss": 0.85488486, "num_input_tokens_seen": 18575720, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.25, "step": 869, "time_per_iteration": 2.4412894248962402 }, { "auxiliary_loss_clip": 0.01163911, "auxiliary_loss_mlp": 0.01054839, "balance_loss_clip": 1.02630055, "balance_loss_mlp": 1.04024363, "epoch": 0.052307229821133325, "flos": 11144310226560.0, "grad_norm": 2.142193581806339, "language_loss": 0.87373012, "learning_rate": 3.973172427418259e-06, "loss": 0.89591759, "num_input_tokens_seen": 18592185, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.234375, "step": 870, "time_per_iteration": 2.4127585887908936 }, { "auxiliary_loss_clip": 0.01164902, "auxiliary_loss_mlp": 0.01054859, "balance_loss_clip": 1.02622497, "balance_loss_mlp": 1.04133844, "epoch": 0.05236735307380129, "flos": 19127271219840.0, "grad_norm": 2.4752433989170615, "language_loss": 0.80509758, "learning_rate": 3.97311071609353e-06, "loss": 0.82729518, "num_input_tokens_seen": 18609560, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.234375, "step": 871, "time_per_iteration": 2.425719976425171 }, { "auxiliary_loss_clip": 0.01161646, "auxiliary_loss_mlp": 0.01046634, "balance_loss_clip": 1.01799941, "balance_loss_mlp": 1.04076588, "epoch": 0.05242747632646926, "flos": 20959582527360.0, "grad_norm": 2.287777610954603, "language_loss": 0.81213582, "learning_rate": 3.973048934353554e-06, "loss": 0.83421862, "num_input_tokens_seen": 18629405, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.2109375, "step": 872, "time_per_iteration": 2.431184768676758 }, { "auxiliary_loss_clip": 0.01048326, "auxiliary_loss_mlp": 0.01032536, "balance_loss_clip": 1.02783895, "balance_loss_mlp": 1.00866389, "epoch": 0.052487599579137234, "flos": 65017632337920.0, "grad_norm": 0.9004996555808237, "language_loss": 0.61653852, "learning_rate": 3.972987082200538e-06, "loss": 0.63734716, "num_input_tokens_seen": 18681480, "router_z_loss_clip": 0.046875, "router_z_loss_mlp": 0.39648438, "step": 873, "time_per_iteration": 2.950486421585083 }, { "auxiliary_loss_clip": 0.01160999, "auxiliary_loss_mlp": 0.01047396, "balance_loss_clip": 1.01950097, "balance_loss_mlp": 1.03949547, "epoch": 0.0525477228318052, "flos": 23287905653760.0, "grad_norm": 2.1513200790654716, "language_loss": 0.88312685, "learning_rate": 3.972925159636687e-06, "loss": 0.90521085, "num_input_tokens_seen": 18700390, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.21875, "step": 874, "time_per_iteration": 2.431154727935791 }, { "auxiliary_loss_clip": 0.01162635, "auxiliary_loss_mlp": 0.01057579, "balance_loss_clip": 1.02794373, "balance_loss_mlp": 1.03995252, "epoch": 0.05260784608447317, "flos": 32228952710400.0, "grad_norm": 1.788658450643276, "language_loss": 0.74017358, "learning_rate": 3.972863166664212e-06, "loss": 0.76237571, "num_input_tokens_seen": 18721280, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.2265625, "step": 875, "time_per_iteration": 2.5350773334503174 }, { "auxiliary_loss_clip": 0.01161192, "auxiliary_loss_mlp": 0.0105587, "balance_loss_clip": 1.0261867, "balance_loss_mlp": 1.04077697, "epoch": 0.052667969337141136, "flos": 24462034410240.0, "grad_norm": 2.113354624868253, "language_loss": 0.9275443, "learning_rate": 3.972801103285326e-06, "loss": 0.9497149, "num_input_tokens_seen": 18741545, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.203125, "step": 876, "time_per_iteration": 2.459304094314575 }, { "auxiliary_loss_clip": 0.01048974, "auxiliary_loss_mlp": 0.01004426, "balance_loss_clip": 0.99989587, "balance_loss_mlp": 1.01024711, "epoch": 0.05272809258980911, "flos": 57780938989440.0, "grad_norm": 0.8418676385507674, "language_loss": 0.62896293, "learning_rate": 3.9727389695022434e-06, "loss": 0.64949697, "num_input_tokens_seen": 18801400, "router_z_loss_clip": 0.04541016, "router_z_loss_mlp": 0.38671875, "step": 877, "time_per_iteration": 3.1060640811920166 }, { "auxiliary_loss_clip": 0.01162697, "auxiliary_loss_mlp": 0.01058139, "balance_loss_clip": 1.02790785, "balance_loss_mlp": 1.04045391, "epoch": 0.05278821584247708, "flos": 17419843445760.0, "grad_norm": 2.6596222001650593, "language_loss": 0.85823625, "learning_rate": 3.972676765317181e-06, "loss": 0.88044465, "num_input_tokens_seen": 18819670, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.21875, "step": 878, "time_per_iteration": 2.431715726852417 }, { "auxiliary_loss_clip": 0.01163295, "auxiliary_loss_mlp": 0.01053228, "balance_loss_clip": 1.02420092, "balance_loss_mlp": 1.04162955, "epoch": 0.052848339095145046, "flos": 26136154880640.0, "grad_norm": 1.9255771100967056, "language_loss": 0.8295579, "learning_rate": 3.97261449073236e-06, "loss": 0.85172307, "num_input_tokens_seen": 18840580, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.21875, "step": 879, "time_per_iteration": 2.469244956970215 }, { "auxiliary_loss_clip": 0.01158954, "auxiliary_loss_mlp": 0.01061843, "balance_loss_clip": 1.03066969, "balance_loss_mlp": 1.03983855, "epoch": 0.05290846234781302, "flos": 16471148538240.0, "grad_norm": 2.0801283716945176, "language_loss": 0.84291494, "learning_rate": 3.9725521457500005e-06, "loss": 0.86512297, "num_input_tokens_seen": 18859295, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.1875, "step": 880, "time_per_iteration": 2.4176676273345947 }, { "auxiliary_loss_clip": 0.01163223, "auxiliary_loss_mlp": 0.01052989, "balance_loss_clip": 1.02311563, "balance_loss_mlp": 1.03948319, "epoch": 0.05296858560048098, "flos": 19864147708800.0, "grad_norm": 2.2089813172294055, "language_loss": 0.86675858, "learning_rate": 3.97248973037233e-06, "loss": 0.88892066, "num_input_tokens_seen": 18877485, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.234375, "step": 881, "time_per_iteration": 3.8267643451690674 }, { "auxiliary_loss_clip": 0.01160676, "auxiliary_loss_mlp": 0.01053967, "balance_loss_clip": 1.02216184, "balance_loss_mlp": 1.03855371, "epoch": 0.053028708853148955, "flos": 24387460012800.0, "grad_norm": 1.9561697480028104, "language_loss": 0.87807399, "learning_rate": 3.972427244601574e-06, "loss": 0.90022039, "num_input_tokens_seen": 18898275, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.21875, "step": 882, "time_per_iteration": 2.4512674808502197 }, { "auxiliary_loss_clip": 0.0116422, "auxiliary_loss_mlp": 0.01053706, "balance_loss_clip": 1.02218688, "balance_loss_mlp": 1.03948426, "epoch": 0.05308883210581693, "flos": 36391681825920.0, "grad_norm": 2.673534550737762, "language_loss": 0.69085759, "learning_rate": 3.972364688439964e-06, "loss": 0.71303678, "num_input_tokens_seen": 18920665, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.25, "step": 883, "time_per_iteration": 2.545847177505493 }, { "auxiliary_loss_clip": 0.01158398, "auxiliary_loss_mlp": 0.01055761, "balance_loss_clip": 1.02661395, "balance_loss_mlp": 1.04118681, "epoch": 0.05314895535848489, "flos": 22854039788160.0, "grad_norm": 3.3655480070090205, "language_loss": 0.76206219, "learning_rate": 3.9723020618897325e-06, "loss": 0.78420377, "num_input_tokens_seen": 18939835, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.171875, "step": 884, "time_per_iteration": 2.4771573543548584 }, { "auxiliary_loss_clip": 0.01158304, "auxiliary_loss_mlp": 0.01053657, "balance_loss_clip": 1.02576232, "balance_loss_mlp": 1.04068482, "epoch": 0.053209078611152864, "flos": 12859488322560.0, "grad_norm": 2.1558972408245882, "language_loss": 0.8541072, "learning_rate": 3.972239364953113e-06, "loss": 0.87622678, "num_input_tokens_seen": 18958405, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.1796875, "step": 885, "time_per_iteration": 5.304793834686279 }, { "auxiliary_loss_clip": 0.01160873, "auxiliary_loss_mlp": 0.01058031, "balance_loss_clip": 1.02813315, "balance_loss_mlp": 1.03778291, "epoch": 0.05326920186382083, "flos": 12163844016000.0, "grad_norm": 2.5549299977840665, "language_loss": 0.85519499, "learning_rate": 3.9721765976323435e-06, "loss": 0.87738407, "num_input_tokens_seen": 18975445, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.234375, "step": 886, "time_per_iteration": 2.3972959518432617 }, { "auxiliary_loss_clip": 0.01159147, "auxiliary_loss_mlp": 0.01056786, "balance_loss_clip": 1.02691174, "balance_loss_mlp": 1.0391463, "epoch": 0.0533293251164888, "flos": 22703564361600.0, "grad_norm": 2.058024973151939, "language_loss": 0.88818395, "learning_rate": 3.972113759929665e-06, "loss": 0.91034329, "num_input_tokens_seen": 18991930, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.203125, "step": 887, "time_per_iteration": 3.844373941421509 }, { "auxiliary_loss_clip": 0.01159157, "auxiliary_loss_mlp": 0.01071229, "balance_loss_clip": 1.03975785, "balance_loss_mlp": 1.0372448, "epoch": 0.053389448369156774, "flos": 26939785622400.0, "grad_norm": 1.9006520640832827, "language_loss": 0.74924183, "learning_rate": 3.9720508518473186e-06, "loss": 0.77154565, "num_input_tokens_seen": 19009790, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.21875, "step": 888, "time_per_iteration": 2.4814414978027344 }, { "auxiliary_loss_clip": 0.01159249, "auxiliary_loss_mlp": 0.01061684, "balance_loss_clip": 1.0303793, "balance_loss_mlp": 1.03939033, "epoch": 0.05344957162182474, "flos": 25555165079040.0, "grad_norm": 2.078289993135767, "language_loss": 0.88061041, "learning_rate": 3.97198787338755e-06, "loss": 0.90281975, "num_input_tokens_seen": 19030170, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.203125, "step": 889, "time_per_iteration": 2.435617685317993 }, { "auxiliary_loss_clip": 0.01158143, "auxiliary_loss_mlp": 0.01051812, "balance_loss_clip": 1.02282035, "balance_loss_mlp": 1.03846538, "epoch": 0.05350969487449271, "flos": 19718559872640.0, "grad_norm": 2.57841427158968, "language_loss": 0.88126409, "learning_rate": 3.971924824552607e-06, "loss": 0.9033637, "num_input_tokens_seen": 19048075, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.1953125, "step": 890, "time_per_iteration": 2.4183461666107178 }, { "auxiliary_loss_clip": 0.01160598, "auxiliary_loss_mlp": 0.0106087, "balance_loss_clip": 1.03183031, "balance_loss_mlp": 1.03818941, "epoch": 0.053569818127160676, "flos": 27015128069760.0, "grad_norm": 2.234304571881244, "language_loss": 0.93175459, "learning_rate": 3.97186170534474e-06, "loss": 0.95396924, "num_input_tokens_seen": 19067465, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.21875, "step": 891, "time_per_iteration": 2.4461190700531006 }, { "auxiliary_loss_clip": 0.01161554, "auxiliary_loss_mlp": 0.01060467, "balance_loss_clip": 1.02894807, "balance_loss_mlp": 1.03905725, "epoch": 0.05362994137982865, "flos": 13187497345920.0, "grad_norm": 2.2302235095241234, "language_loss": 0.71824193, "learning_rate": 3.9717985157662e-06, "loss": 0.74046212, "num_input_tokens_seen": 19085505, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.2265625, "step": 892, "time_per_iteration": 2.4148337841033936 }, { "auxiliary_loss_clip": 0.01162488, "auxiliary_loss_mlp": 0.01068779, "balance_loss_clip": 1.03904855, "balance_loss_mlp": 1.03886676, "epoch": 0.05369006463249662, "flos": 28656744197760.0, "grad_norm": 1.8684471729019887, "language_loss": 0.82398784, "learning_rate": 3.971735255819244e-06, "loss": 0.8463006, "num_input_tokens_seen": 19104360, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.234375, "step": 893, "time_per_iteration": 2.4615511894226074 }, { "auxiliary_loss_clip": 0.01161726, "auxiliary_loss_mlp": 0.01058905, "balance_loss_clip": 1.02868581, "balance_loss_mlp": 1.03838944, "epoch": 0.053750187885164585, "flos": 28911889480320.0, "grad_norm": 2.5652337661280993, "language_loss": 0.81720483, "learning_rate": 3.971671925506129e-06, "loss": 0.83941114, "num_input_tokens_seen": 19124680, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.234375, "step": 894, "time_per_iteration": 2.492638349533081 }, { "auxiliary_loss_clip": 0.01157092, "auxiliary_loss_mlp": 0.01059332, "balance_loss_clip": 1.02929115, "balance_loss_mlp": 1.03694069, "epoch": 0.05381031113783256, "flos": 15157925458560.0, "grad_norm": 3.843311202633951, "language_loss": 0.75092781, "learning_rate": 3.9716085248291125e-06, "loss": 0.77309203, "num_input_tokens_seen": 19142895, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.203125, "step": 895, "time_per_iteration": 2.380563974380493 }, { "auxiliary_loss_clip": 0.0116928, "auxiliary_loss_mlp": 0.01058747, "balance_loss_clip": 1.02970707, "balance_loss_mlp": 1.0438832, "epoch": 0.05387043439050053, "flos": 21834156885120.0, "grad_norm": 2.4046720652318454, "language_loss": 0.86494035, "learning_rate": 3.97154505379046e-06, "loss": 0.88722062, "num_input_tokens_seen": 19163125, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.25, "step": 896, "time_per_iteration": 2.43613338470459 }, { "auxiliary_loss_clip": 0.01164866, "auxiliary_loss_mlp": 0.01054395, "balance_loss_clip": 1.02225626, "balance_loss_mlp": 1.03944147, "epoch": 0.053930557643168495, "flos": 17309378304000.0, "grad_norm": 6.219713565318159, "language_loss": 0.88008451, "learning_rate": 3.971481512392438e-06, "loss": 0.90227711, "num_input_tokens_seen": 19179385, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.25, "step": 897, "time_per_iteration": 2.381504535675049 }, { "auxiliary_loss_clip": 0.01159566, "auxiliary_loss_mlp": 0.01061987, "balance_loss_clip": 1.03080177, "balance_loss_mlp": 1.03830087, "epoch": 0.05399068089583647, "flos": 17347503375360.0, "grad_norm": 1.7870846423499203, "language_loss": 0.90078026, "learning_rate": 3.97141790063731e-06, "loss": 0.92299581, "num_input_tokens_seen": 19198725, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.21875, "step": 898, "time_per_iteration": 2.3894176483154297 }, { "auxiliary_loss_clip": 0.01164325, "auxiliary_loss_mlp": 0.01069351, "balance_loss_clip": 1.03890502, "balance_loss_mlp": 1.03923023, "epoch": 0.05405080414850443, "flos": 17486178762240.0, "grad_norm": 2.6068244252625465, "language_loss": 0.92166436, "learning_rate": 3.971354218527349e-06, "loss": 0.94400114, "num_input_tokens_seen": 19212380, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.25, "step": 899, "time_per_iteration": 2.374274969100952 }, { "auxiliary_loss_clip": 0.01159553, "auxiliary_loss_mlp": 0.01058724, "balance_loss_clip": 1.02880204, "balance_loss_mlp": 1.03960335, "epoch": 0.054110927401172404, "flos": 24495690827520.0, "grad_norm": 2.03080882462333, "language_loss": 0.75723553, "learning_rate": 3.971290466064827e-06, "loss": 0.77941823, "num_input_tokens_seen": 19232235, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.203125, "step": 900, "time_per_iteration": 2.5287563800811768 }, { "auxiliary_loss_clip": 0.01161827, "auxiliary_loss_mlp": 0.01055122, "balance_loss_clip": 1.0256772, "balance_loss_mlp": 1.03749549, "epoch": 0.054171050653840376, "flos": 22928928387840.0, "grad_norm": 3.188934588277214, "language_loss": 0.73738217, "learning_rate": 3.971226643252019e-06, "loss": 0.7595517, "num_input_tokens_seen": 19251460, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.25, "step": 901, "time_per_iteration": 2.408686876296997 }, { "auxiliary_loss_clip": 0.01154546, "auxiliary_loss_mlp": 0.01061796, "balance_loss_clip": 1.03520036, "balance_loss_mlp": 1.03875566, "epoch": 0.05423117390650834, "flos": 12932352063360.0, "grad_norm": 2.102878927305348, "language_loss": 0.8485086, "learning_rate": 3.971162750091202e-06, "loss": 0.87067199, "num_input_tokens_seen": 19269060, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.15625, "step": 902, "time_per_iteration": 2.3947722911834717 }, { "auxiliary_loss_clip": 0.01158079, "auxiliary_loss_mlp": 0.0105659, "balance_loss_clip": 1.02650177, "balance_loss_mlp": 1.03704238, "epoch": 0.05429129715917631, "flos": 19900317744000.0, "grad_norm": 2.06799905004133, "language_loss": 0.86127782, "learning_rate": 3.971098786584657e-06, "loss": 0.88342452, "num_input_tokens_seen": 19288620, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.2109375, "step": 903, "time_per_iteration": 2.4634947776794434 }, { "auxiliary_loss_clip": 0.01156319, "auxiliary_loss_mlp": 0.01050202, "balance_loss_clip": 1.02140117, "balance_loss_mlp": 1.03751755, "epoch": 0.05435142041184428, "flos": 16907702578560.0, "grad_norm": 2.4141723378563067, "language_loss": 0.75000405, "learning_rate": 3.971034752734668e-06, "loss": 0.77206928, "num_input_tokens_seen": 19306615, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.1875, "step": 904, "time_per_iteration": 2.384925365447998 }, { "auxiliary_loss_clip": 0.01162563, "auxiliary_loss_mlp": 0.01054817, "balance_loss_clip": 1.02602839, "balance_loss_mlp": 1.04079747, "epoch": 0.05441154366451225, "flos": 23947275191040.0, "grad_norm": 2.6027161513202386, "language_loss": 0.85758334, "learning_rate": 3.970970648543517e-06, "loss": 0.87975711, "num_input_tokens_seen": 19321680, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.21875, "step": 905, "time_per_iteration": 2.423539161682129 }, { "auxiliary_loss_clip": 0.01157356, "auxiliary_loss_mlp": 0.01053976, "balance_loss_clip": 1.02722573, "balance_loss_mlp": 1.0414567, "epoch": 0.05447166691718022, "flos": 19974333559680.0, "grad_norm": 3.568337735308201, "language_loss": 0.74576402, "learning_rate": 3.970906474013494e-06, "loss": 0.76787734, "num_input_tokens_seen": 19339760, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.15625, "step": 906, "time_per_iteration": 2.4099528789520264 }, { "auxiliary_loss_clip": 0.0116229, "auxiliary_loss_mlp": 0.01055882, "balance_loss_clip": 1.02797496, "balance_loss_mlp": 1.03734863, "epoch": 0.05453179016984819, "flos": 24935351978880.0, "grad_norm": 1.9587767139692178, "language_loss": 0.86923331, "learning_rate": 3.97084222914689e-06, "loss": 0.891415, "num_input_tokens_seen": 19359585, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.25, "step": 907, "time_per_iteration": 2.467411994934082 }, { "auxiliary_loss_clip": 0.0116156, "auxiliary_loss_mlp": 0.01063412, "balance_loss_clip": 1.03406227, "balance_loss_mlp": 1.04133844, "epoch": 0.05459191342251616, "flos": 18114091297920.0, "grad_norm": 3.51243039724905, "language_loss": 0.86991906, "learning_rate": 3.970777913945995e-06, "loss": 0.89216876, "num_input_tokens_seen": 19378590, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.203125, "step": 908, "time_per_iteration": 2.421147584915161 }, { "auxiliary_loss_clip": 0.01160336, "auxiliary_loss_mlp": 0.01057968, "balance_loss_clip": 1.02871394, "balance_loss_mlp": 1.04014111, "epoch": 0.054652036675184125, "flos": 19207291789440.0, "grad_norm": 2.1778556300511402, "language_loss": 0.89483535, "learning_rate": 3.970713528413106e-06, "loss": 0.91701841, "num_input_tokens_seen": 19397910, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.203125, "step": 909, "time_per_iteration": 2.433941602706909 }, { "auxiliary_loss_clip": 0.01161949, "auxiliary_loss_mlp": 0.01063548, "balance_loss_clip": 1.03183877, "balance_loss_mlp": 1.03985989, "epoch": 0.0547121599278521, "flos": 16324827563520.0, "grad_norm": 4.50022788736104, "language_loss": 0.71124053, "learning_rate": 3.9706490725505205e-06, "loss": 0.73349547, "num_input_tokens_seen": 19415950, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.21875, "step": 910, "time_per_iteration": 2.3976662158966064 }, { "auxiliary_loss_clip": 0.01156027, "auxiliary_loss_mlp": 0.01053817, "balance_loss_clip": 1.02468252, "balance_loss_mlp": 1.0386641, "epoch": 0.05477228318052007, "flos": 20337988947840.0, "grad_norm": 1.814052792527829, "language_loss": 0.83245134, "learning_rate": 3.970584546360539e-06, "loss": 0.85454977, "num_input_tokens_seen": 19435275, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.171875, "step": 911, "time_per_iteration": 2.4349710941314697 }, { "auxiliary_loss_clip": 0.01159004, "auxiliary_loss_mlp": 0.01054041, "balance_loss_clip": 1.02214098, "balance_loss_mlp": 1.03775454, "epoch": 0.054832406433188034, "flos": 21972238778880.0, "grad_norm": 3.1255396403293156, "language_loss": 0.75924587, "learning_rate": 3.970519949845464e-06, "loss": 0.78137636, "num_input_tokens_seen": 19452090, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.2109375, "step": 912, "time_per_iteration": 2.39033842086792 }, { "auxiliary_loss_clip": 0.01155644, "auxiliary_loss_mlp": 0.01056618, "balance_loss_clip": 1.02753103, "balance_loss_mlp": 1.03938627, "epoch": 0.054892529685856006, "flos": 16398005506560.0, "grad_norm": 2.468181012949965, "language_loss": 0.82650316, "learning_rate": 3.9704552830076005e-06, "loss": 0.84862584, "num_input_tokens_seen": 19470865, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.15625, "step": 913, "time_per_iteration": 2.4073081016540527 }, { "auxiliary_loss_clip": 0.01157295, "auxiliary_loss_mlp": 0.01052201, "balance_loss_clip": 1.02399659, "balance_loss_mlp": 1.04099143, "epoch": 0.05495265293852397, "flos": 23911279712640.0, "grad_norm": 2.064636756581716, "language_loss": 0.8323791, "learning_rate": 3.9703905458492564e-06, "loss": 0.85447407, "num_input_tokens_seen": 19492145, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.15625, "step": 914, "time_per_iteration": 2.4616618156433105 }, { "auxiliary_loss_clip": 0.01161332, "auxiliary_loss_mlp": 0.01057481, "balance_loss_clip": 1.02823925, "balance_loss_mlp": 1.04124427, "epoch": 0.055012776191191944, "flos": 23585819218560.0, "grad_norm": 3.304359884839736, "language_loss": 0.8976059, "learning_rate": 3.970325738372742e-06, "loss": 0.91979396, "num_input_tokens_seen": 19511015, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.203125, "step": 915, "time_per_iteration": 2.444795608520508 }, { "auxiliary_loss_clip": 0.01158095, "auxiliary_loss_mlp": 0.01058386, "balance_loss_clip": 1.03014517, "balance_loss_mlp": 1.03939843, "epoch": 0.055072899443859916, "flos": 17527585501440.0, "grad_norm": 1.733987534023442, "language_loss": 0.89628351, "learning_rate": 3.970260860580371e-06, "loss": 0.91844833, "num_input_tokens_seen": 19529040, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.1875, "step": 916, "time_per_iteration": 2.412970542907715 }, { "auxiliary_loss_clip": 0.01160748, "auxiliary_loss_mlp": 0.01062094, "balance_loss_clip": 1.03136241, "balance_loss_mlp": 1.03984213, "epoch": 0.05513302269652788, "flos": 21686160165120.0, "grad_norm": 4.549710784254671, "language_loss": 0.79854846, "learning_rate": 3.970195912474457e-06, "loss": 0.82077694, "num_input_tokens_seen": 19549540, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.2109375, "step": 917, "time_per_iteration": 2.4622411727905273 }, { "auxiliary_loss_clip": 0.0116097, "auxiliary_loss_mlp": 0.01055916, "balance_loss_clip": 1.02758026, "balance_loss_mlp": 1.03956866, "epoch": 0.05519314594919585, "flos": 21612353817600.0, "grad_norm": 2.0877874916789505, "language_loss": 0.79856837, "learning_rate": 3.9701308940573195e-06, "loss": 0.82073724, "num_input_tokens_seen": 19567570, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.2109375, "step": 918, "time_per_iteration": 2.42089581489563 }, { "auxiliary_loss_clip": 0.01158156, "auxiliary_loss_mlp": 0.01048415, "balance_loss_clip": 1.0192802, "balance_loss_mlp": 1.03866529, "epoch": 0.05525326920186382, "flos": 21797498090880.0, "grad_norm": 1.8702226836959464, "language_loss": 0.88963503, "learning_rate": 3.970065805331279e-06, "loss": 0.91170073, "num_input_tokens_seen": 19585330, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.1953125, "step": 919, "time_per_iteration": 2.4145870208740234 }, { "auxiliary_loss_clip": 0.01155088, "auxiliary_loss_mlp": 0.01054277, "balance_loss_clip": 1.02604783, "balance_loss_mlp": 1.0378617, "epoch": 0.05531339245453179, "flos": 28438362443520.0, "grad_norm": 2.341967609968428, "language_loss": 0.86990917, "learning_rate": 3.970000646298656e-06, "loss": 0.89200282, "num_input_tokens_seen": 19604970, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.171875, "step": 920, "time_per_iteration": 2.4621634483337402 }, { "auxiliary_loss_clip": 0.0115865, "auxiliary_loss_mlp": 0.01054321, "balance_loss_clip": 1.02559161, "balance_loss_mlp": 1.03921771, "epoch": 0.05537351570719976, "flos": 37373718948480.0, "grad_norm": 2.1703236391249847, "language_loss": 0.65769506, "learning_rate": 3.969935416961778e-06, "loss": 0.67982477, "num_input_tokens_seen": 19626235, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.1953125, "step": 921, "time_per_iteration": 3.9598336219787598 }, { "auxiliary_loss_clip": 0.01163449, "auxiliary_loss_mlp": 0.01055815, "balance_loss_clip": 1.0239861, "balance_loss_mlp": 1.04239345, "epoch": 0.05543363895986773, "flos": 20083437158400.0, "grad_norm": 4.7013371739042045, "language_loss": 0.71693504, "learning_rate": 3.969870117322973e-06, "loss": 0.7391277, "num_input_tokens_seen": 19644305, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.2109375, "step": 922, "time_per_iteration": 2.426201581954956 }, { "auxiliary_loss_clip": 0.01161821, "auxiliary_loss_mlp": 0.01062252, "balance_loss_clip": 1.03109097, "balance_loss_mlp": 1.03957999, "epoch": 0.0554937622125357, "flos": 24532105242240.0, "grad_norm": 2.542450463504702, "language_loss": 0.82041645, "learning_rate": 3.96980474738457e-06, "loss": 0.84265721, "num_input_tokens_seen": 19662130, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.21875, "step": 923, "time_per_iteration": 2.434980869293213 }, { "auxiliary_loss_clip": 0.01159792, "auxiliary_loss_mlp": 0.01056812, "balance_loss_clip": 1.02710509, "balance_loss_mlp": 1.0386827, "epoch": 0.055553885465203665, "flos": 14319172022400.0, "grad_norm": 2.0976337306139627, "language_loss": 0.78356576, "learning_rate": 3.969739307148902e-06, "loss": 0.80573177, "num_input_tokens_seen": 19680715, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.2109375, "step": 924, "time_per_iteration": 2.3935599327087402 }, { "auxiliary_loss_clip": 0.01158258, "auxiliary_loss_mlp": 0.01051702, "balance_loss_clip": 1.0231396, "balance_loss_mlp": 1.03932667, "epoch": 0.05561400871787164, "flos": 27379900621440.0, "grad_norm": 1.9906794558315535, "language_loss": 1.0172838, "learning_rate": 3.969673796618306e-06, "loss": 1.03938341, "num_input_tokens_seen": 19700535, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.1875, "step": 925, "time_per_iteration": 5.295424461364746 }, { "auxiliary_loss_clip": 0.01158902, "auxiliary_loss_mlp": 0.01047981, "balance_loss_clip": 1.01815462, "balance_loss_mlp": 1.03811073, "epoch": 0.05567413197053961, "flos": 23219999326080.0, "grad_norm": 1.9889175308308498, "language_loss": 0.80755478, "learning_rate": 3.969608215795117e-06, "loss": 0.82962364, "num_input_tokens_seen": 19718825, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.203125, "step": 926, "time_per_iteration": 2.4128670692443848 }, { "auxiliary_loss_clip": 0.01166394, "auxiliary_loss_mlp": 0.01050217, "balance_loss_clip": 1.01919866, "balance_loss_mlp": 1.04073811, "epoch": 0.055734255223207574, "flos": 25263779938560.0, "grad_norm": 2.2196935082358995, "language_loss": 0.7284615, "learning_rate": 3.969542564681679e-06, "loss": 0.75062764, "num_input_tokens_seen": 19739080, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.2578125, "step": 927, "time_per_iteration": 3.88985538482666 }, { "auxiliary_loss_clip": 0.01047593, "auxiliary_loss_mlp": 0.01003839, "balance_loss_clip": 0.99959511, "balance_loss_mlp": 1.00975871, "epoch": 0.055794378475875546, "flos": 66499519662720.0, "grad_norm": 0.7940120874990624, "language_loss": 0.59834445, "learning_rate": 3.969476843280333e-06, "loss": 0.61885875, "num_input_tokens_seen": 19802960, "router_z_loss_clip": 0.04248047, "router_z_loss_mlp": 0.37890625, "step": 928, "time_per_iteration": 3.073519706726074 }, { "auxiliary_loss_clip": 0.01161437, "auxiliary_loss_mlp": 0.01059613, "balance_loss_clip": 1.02945352, "balance_loss_mlp": 1.04066682, "epoch": 0.05585450172854351, "flos": 25336469122560.0, "grad_norm": 2.741727701465678, "language_loss": 0.94735438, "learning_rate": 3.969411051593424e-06, "loss": 0.96956486, "num_input_tokens_seen": 19822765, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.203125, "step": 929, "time_per_iteration": 2.4511215686798096 }, { "auxiliary_loss_clip": 0.01159328, "auxiliary_loss_mlp": 0.01056288, "balance_loss_clip": 1.02419722, "balance_loss_mlp": 1.03711009, "epoch": 0.05591462498121148, "flos": 33910334743680.0, "grad_norm": 2.0145312235320367, "language_loss": 0.71520591, "learning_rate": 3.9693451896233e-06, "loss": 0.73736215, "num_input_tokens_seen": 19843590, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.21875, "step": 930, "time_per_iteration": 2.519730567932129 }, { "auxiliary_loss_clip": 0.01162948, "auxiliary_loss_mlp": 0.01054579, "balance_loss_clip": 1.02428746, "balance_loss_mlp": 1.04021239, "epoch": 0.055974748233879455, "flos": 17929924542720.0, "grad_norm": 7.092155019963012, "language_loss": 0.84803557, "learning_rate": 3.969279257372313e-06, "loss": 0.87021089, "num_input_tokens_seen": 19860230, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.2265625, "step": 931, "time_per_iteration": 2.387993097305298 }, { "auxiliary_loss_clip": 0.01163628, "auxiliary_loss_mlp": 0.01061806, "balance_loss_clip": 1.03116918, "balance_loss_mlp": 1.03927064, "epoch": 0.05603487148654742, "flos": 24020906981760.0, "grad_norm": 1.7785238580422558, "language_loss": 0.83289844, "learning_rate": 3.969213254842814e-06, "loss": 0.85515279, "num_input_tokens_seen": 19880795, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.25, "step": 932, "time_per_iteration": 2.460404634475708 }, { "auxiliary_loss_clip": 0.01163613, "auxiliary_loss_mlp": 0.01056253, "balance_loss_clip": 1.02413821, "balance_loss_mlp": 1.04108763, "epoch": 0.05609499473921539, "flos": 17306899597440.0, "grad_norm": 2.233640975121556, "language_loss": 0.73572028, "learning_rate": 3.9691471820371594e-06, "loss": 0.75791895, "num_input_tokens_seen": 19897960, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.2265625, "step": 933, "time_per_iteration": 2.430478096008301 }, { "auxiliary_loss_clip": 0.0115802, "auxiliary_loss_mlp": 0.01060328, "balance_loss_clip": 1.02950001, "balance_loss_mlp": 1.03795409, "epoch": 0.05615511799188336, "flos": 20993727703680.0, "grad_norm": 2.738979518074269, "language_loss": 0.86471808, "learning_rate": 3.969081038957708e-06, "loss": 0.88690156, "num_input_tokens_seen": 19913315, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.203125, "step": 934, "time_per_iteration": 2.3857738971710205 }, { "auxiliary_loss_clip": 0.01153999, "auxiliary_loss_mlp": 0.01058784, "balance_loss_clip": 1.03041255, "balance_loss_mlp": 1.03940582, "epoch": 0.05621524124455133, "flos": 17272614775680.0, "grad_norm": 2.1381230663333164, "language_loss": 0.80012619, "learning_rate": 3.969014825606819e-06, "loss": 0.82225406, "num_input_tokens_seen": 19928790, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.1484375, "step": 935, "time_per_iteration": 2.3873655796051025 }, { "auxiliary_loss_clip": 0.01043219, "auxiliary_loss_mlp": 0.01005184, "balance_loss_clip": 1.00051081, "balance_loss_mlp": 1.00608182, "epoch": 0.0562753644972193, "flos": 58716332668800.0, "grad_norm": 0.8227501343542768, "language_loss": 0.69188774, "learning_rate": 3.968948541986855e-06, "loss": 0.71237177, "num_input_tokens_seen": 19988785, "router_z_loss_clip": 0.04663086, "router_z_loss_mlp": 0.37109375, "step": 936, "time_per_iteration": 2.932800531387329 }, { "auxiliary_loss_clip": 0.01158892, "auxiliary_loss_mlp": 0.01053648, "balance_loss_clip": 1.02417982, "balance_loss_mlp": 1.03832948, "epoch": 0.05633548774988727, "flos": 17456083303680.0, "grad_norm": 2.9910462478789834, "language_loss": 0.75406981, "learning_rate": 3.968882188100183e-06, "loss": 0.77619517, "num_input_tokens_seen": 20007685, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.203125, "step": 937, "time_per_iteration": 2.412381410598755 }, { "auxiliary_loss_clip": 0.01042286, "auxiliary_loss_mlp": 0.01003793, "balance_loss_clip": 0.99935871, "balance_loss_mlp": 1.00569856, "epoch": 0.05639561100255524, "flos": 70651426256640.0, "grad_norm": 0.8602694962622135, "language_loss": 0.64379501, "learning_rate": 3.9688157639491704e-06, "loss": 0.66425586, "num_input_tokens_seen": 20072750, "router_z_loss_clip": 0.04443359, "router_z_loss_mlp": 0.3671875, "step": 938, "time_per_iteration": 3.023224353790283 }, { "auxiliary_loss_clip": 0.01166904, "auxiliary_loss_mlp": 0.01058033, "balance_loss_clip": 1.02677608, "balance_loss_mlp": 1.03905725, "epoch": 0.056455734255223204, "flos": 20484938327040.0, "grad_norm": 2.812712479682215, "language_loss": 0.79116201, "learning_rate": 3.968749269536188e-06, "loss": 0.81341136, "num_input_tokens_seen": 20089070, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.28125, "step": 939, "time_per_iteration": 2.4041600227355957 }, { "auxiliary_loss_clip": 0.01158136, "auxiliary_loss_mlp": 0.01054269, "balance_loss_clip": 1.02573061, "balance_loss_mlp": 1.03815985, "epoch": 0.056515857507891176, "flos": 22052503728000.0, "grad_norm": 1.798056398583246, "language_loss": 0.73791158, "learning_rate": 3.9686827048636074e-06, "loss": 0.76003563, "num_input_tokens_seen": 20108790, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.203125, "step": 940, "time_per_iteration": 2.4448323249816895 }, { "auxiliary_loss_clip": 0.01159927, "auxiliary_loss_mlp": 0.01061686, "balance_loss_clip": 1.03264642, "balance_loss_mlp": 1.04005516, "epoch": 0.05657598076055915, "flos": 24024153738240.0, "grad_norm": 1.873028528359329, "language_loss": 0.70337206, "learning_rate": 3.968616069933806e-06, "loss": 0.7255882, "num_input_tokens_seen": 20128455, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.203125, "step": 941, "time_per_iteration": 2.443727731704712 }, { "auxiliary_loss_clip": 0.01156185, "auxiliary_loss_mlp": 0.01053917, "balance_loss_clip": 1.02378058, "balance_loss_mlp": 1.03895378, "epoch": 0.05663610401322711, "flos": 20479701623040.0, "grad_norm": 1.8997356430322379, "language_loss": 0.806705, "learning_rate": 3.96854936474916e-06, "loss": 0.82880604, "num_input_tokens_seen": 20145775, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.171875, "step": 942, "time_per_iteration": 2.40793514251709 }, { "auxiliary_loss_clip": 0.01156782, "auxiliary_loss_mlp": 0.01056196, "balance_loss_clip": 1.02615571, "balance_loss_mlp": 1.03941846, "epoch": 0.056696227265895086, "flos": 21067987898880.0, "grad_norm": 2.2031773806584423, "language_loss": 0.880005, "learning_rate": 3.968482589312052e-06, "loss": 0.90213478, "num_input_tokens_seen": 20164315, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.171875, "step": 943, "time_per_iteration": 2.4251973628997803 }, { "auxiliary_loss_clip": 0.01160592, "auxiliary_loss_mlp": 0.01055373, "balance_loss_clip": 1.02628577, "balance_loss_mlp": 1.04070008, "epoch": 0.05675635051856306, "flos": 17820367096320.0, "grad_norm": 2.2903411620696725, "language_loss": 0.74629074, "learning_rate": 3.968415743624863e-06, "loss": 0.76845038, "num_input_tokens_seen": 20182760, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.1953125, "step": 944, "time_per_iteration": 2.4026761054992676 }, { "auxiliary_loss_clip": 0.01155027, "auxiliary_loss_mlp": 0.01055523, "balance_loss_clip": 1.02712703, "balance_loss_mlp": 1.03648019, "epoch": 0.05681647377123102, "flos": 23113758458880.0, "grad_norm": 1.5914147172454032, "language_loss": 0.79131436, "learning_rate": 3.9683488276899794e-06, "loss": 0.81341994, "num_input_tokens_seen": 20203830, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.1875, "step": 945, "time_per_iteration": 2.5026559829711914 }, { "auxiliary_loss_clip": 0.01158836, "auxiliary_loss_mlp": 0.01053737, "balance_loss_clip": 1.02420902, "balance_loss_mlp": 1.03774977, "epoch": 0.056876597023898995, "flos": 16069612458240.0, "grad_norm": 2.2531418065513664, "language_loss": 0.82614088, "learning_rate": 3.96828184150979e-06, "loss": 0.8482666, "num_input_tokens_seen": 20220365, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.2109375, "step": 946, "time_per_iteration": 2.393521785736084 }, { "auxiliary_loss_clip": 0.01164953, "auxiliary_loss_mlp": 0.01056321, "balance_loss_clip": 1.0244925, "balance_loss_mlp": 1.04142892, "epoch": 0.05693672027656696, "flos": 16834734103680.0, "grad_norm": 1.9265319793675204, "language_loss": 0.79115474, "learning_rate": 3.968214785086684e-06, "loss": 0.81336749, "num_input_tokens_seen": 20238640, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.234375, "step": 947, "time_per_iteration": 2.4404330253601074 }, { "auxiliary_loss_clip": 0.01163781, "auxiliary_loss_mlp": 0.01061873, "balance_loss_clip": 1.0308311, "balance_loss_mlp": 1.04187322, "epoch": 0.05699684352923493, "flos": 21388281511680.0, "grad_norm": 3.7219270590303255, "language_loss": 0.8536315, "learning_rate": 3.968147658423056e-06, "loss": 0.87588805, "num_input_tokens_seen": 20251025, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.21875, "step": 948, "time_per_iteration": 2.3871397972106934 }, { "auxiliary_loss_clip": 0.01161505, "auxiliary_loss_mlp": 0.01060613, "balance_loss_clip": 1.0270915, "balance_loss_mlp": 1.04082263, "epoch": 0.057056966781902904, "flos": 15559391715840.0, "grad_norm": 1.8594148227814742, "language_loss": 0.87232089, "learning_rate": 3.9680804615213e-06, "loss": 0.8945421, "num_input_tokens_seen": 20269775, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.203125, "step": 949, "time_per_iteration": 2.4071102142333984 }, { "auxiliary_loss_clip": 0.01155606, "auxiliary_loss_mlp": 0.01054039, "balance_loss_clip": 1.02651358, "balance_loss_mlp": 1.03915536, "epoch": 0.05711709003457087, "flos": 19936836892800.0, "grad_norm": 2.011604601070385, "language_loss": 0.78427905, "learning_rate": 3.968013194383815e-06, "loss": 0.8063755, "num_input_tokens_seen": 20287715, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.1640625, "step": 950, "time_per_iteration": 2.3976221084594727 }, { "auxiliary_loss_clip": 0.0116531, "auxiliary_loss_mlp": 0.01060047, "balance_loss_clip": 1.03012562, "balance_loss_mlp": 1.04249465, "epoch": 0.05717721328723884, "flos": 30331493072640.0, "grad_norm": 2.233275216295547, "language_loss": 0.82126546, "learning_rate": 3.967945857013002e-06, "loss": 0.84351903, "num_input_tokens_seen": 20307070, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.2265625, "step": 951, "time_per_iteration": 2.5157175064086914 }, { "auxiliary_loss_clip": 0.01157948, "auxiliary_loss_mlp": 0.0106173, "balance_loss_clip": 1.03192782, "balance_loss_mlp": 1.03772712, "epoch": 0.05723733653990681, "flos": 23653376432640.0, "grad_norm": 2.473968384898655, "language_loss": 0.86654651, "learning_rate": 3.967878449411263e-06, "loss": 0.88874328, "num_input_tokens_seen": 20324945, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.203125, "step": 952, "time_per_iteration": 2.41511607170105 }, { "auxiliary_loss_clip": 0.0116263, "auxiliary_loss_mlp": 0.01060063, "balance_loss_clip": 1.02770972, "balance_loss_mlp": 1.03811193, "epoch": 0.05729745979257478, "flos": 22054633320960.0, "grad_norm": 1.921396913652021, "language_loss": 0.79379117, "learning_rate": 3.967810971581004e-06, "loss": 0.81601816, "num_input_tokens_seen": 20346135, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.2421875, "step": 953, "time_per_iteration": 2.4393908977508545 }, { "auxiliary_loss_clip": 0.01162789, "auxiliary_loss_mlp": 0.01057679, "balance_loss_clip": 1.0272572, "balance_loss_mlp": 1.04243231, "epoch": 0.05735758304524275, "flos": 19603486431360.0, "grad_norm": 2.1109943663128177, "language_loss": 0.86476898, "learning_rate": 3.967743423524633e-06, "loss": 0.88697374, "num_input_tokens_seen": 20364450, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.203125, "step": 954, "time_per_iteration": 2.4184181690216064 }, { "auxiliary_loss_clip": 0.01162818, "auxiliary_loss_mlp": 0.01056128, "balance_loss_clip": 1.02434659, "balance_loss_mlp": 1.040133, "epoch": 0.057417706297910716, "flos": 19098013633920.0, "grad_norm": 2.4544342404404973, "language_loss": 0.87799019, "learning_rate": 3.967675805244562e-06, "loss": 0.90017962, "num_input_tokens_seen": 20383500, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.2265625, "step": 955, "time_per_iteration": 2.424464225769043 }, { "auxiliary_loss_clip": 0.01158728, "auxiliary_loss_mlp": 0.01057114, "balance_loss_clip": 1.0270493, "balance_loss_mlp": 1.03928959, "epoch": 0.05747782955057869, "flos": 16653569725440.0, "grad_norm": 2.2318971564074923, "language_loss": 0.89087892, "learning_rate": 3.967608116743202e-06, "loss": 0.9130373, "num_input_tokens_seen": 20400295, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.1953125, "step": 956, "time_per_iteration": 2.369446039199829 }, { "auxiliary_loss_clip": 0.01159596, "auxiliary_loss_mlp": 0.01056179, "balance_loss_clip": 1.02831984, "balance_loss_mlp": 1.0415616, "epoch": 0.05753795280324665, "flos": 14501174273280.0, "grad_norm": 2.8091922156104077, "language_loss": 0.75586867, "learning_rate": 3.96754035802297e-06, "loss": 0.7780264, "num_input_tokens_seen": 20419085, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.1796875, "step": 957, "time_per_iteration": 2.4029459953308105 }, { "auxiliary_loss_clip": 0.01164806, "auxiliary_loss_mlp": 0.01067315, "balance_loss_clip": 1.03374577, "balance_loss_mlp": 1.04087114, "epoch": 0.057598076055914625, "flos": 18075372733440.0, "grad_norm": 2.0366571588287563, "language_loss": 0.79876363, "learning_rate": 3.967472529086284e-06, "loss": 0.82108486, "num_input_tokens_seen": 20437465, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.234375, "step": 958, "time_per_iteration": 2.418729066848755 }, { "auxiliary_loss_clip": 0.01158242, "auxiliary_loss_mlp": 0.01049047, "balance_loss_clip": 1.02119946, "balance_loss_mlp": 1.03792882, "epoch": 0.0576581993085826, "flos": 22123586989440.0, "grad_norm": 2.6994391636694663, "language_loss": 0.88083041, "learning_rate": 3.967404629935564e-06, "loss": 0.90290332, "num_input_tokens_seen": 20456235, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.203125, "step": 959, "time_per_iteration": 2.4110212326049805 }, { "auxiliary_loss_clip": 0.01153443, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.01970053, "balance_loss_mlp": 1.03854108, "epoch": 0.05771832256125056, "flos": 33180370704000.0, "grad_norm": 10.099762558081204, "language_loss": 0.7851907, "learning_rate": 3.9673366605732335e-06, "loss": 0.80721611, "num_input_tokens_seen": 20476825, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.1484375, "step": 960, "time_per_iteration": 2.5175628662109375 }, { "auxiliary_loss_clip": 0.01158291, "auxiliary_loss_mlp": 0.01052655, "balance_loss_clip": 1.0229001, "balance_loss_mlp": 1.03849053, "epoch": 0.057778445813918534, "flos": 24169008435840.0, "grad_norm": 2.056220829596911, "language_loss": 0.93077898, "learning_rate": 3.967268621001718e-06, "loss": 0.95288843, "num_input_tokens_seen": 20496965, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.1953125, "step": 961, "time_per_iteration": 3.873457193374634 }, { "auxiliary_loss_clip": 0.01158597, "auxiliary_loss_mlp": 0.01059004, "balance_loss_clip": 1.02731872, "balance_loss_mlp": 1.03795218, "epoch": 0.0578385690665865, "flos": 29641748785920.0, "grad_norm": 2.839739402560139, "language_loss": 0.68123364, "learning_rate": 3.967200511223446e-06, "loss": 0.70340973, "num_input_tokens_seen": 20518035, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.2109375, "step": 962, "time_per_iteration": 2.462498664855957 }, { "auxiliary_loss_clip": 0.01158783, "auxiliary_loss_mlp": 0.01056919, "balance_loss_clip": 1.02739072, "balance_loss_mlp": 1.04080355, "epoch": 0.05789869231925447, "flos": 20884414636800.0, "grad_norm": 2.6681272796667352, "language_loss": 0.88147473, "learning_rate": 3.967132331240848e-06, "loss": 0.90363169, "num_input_tokens_seen": 20534740, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.1796875, "step": 963, "time_per_iteration": 2.4187171459198 }, { "auxiliary_loss_clip": 0.01162174, "auxiliary_loss_mlp": 0.01047752, "balance_loss_clip": 1.01761627, "balance_loss_mlp": 1.04180944, "epoch": 0.057958815571922444, "flos": 26029914013440.0, "grad_norm": 2.1959636755272665, "language_loss": 0.8503716, "learning_rate": 3.9670640810563575e-06, "loss": 0.87247086, "num_input_tokens_seen": 20553485, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.203125, "step": 964, "time_per_iteration": 3.964810609817505 }, { "auxiliary_loss_clip": 0.01157798, "auxiliary_loss_mlp": 0.01061291, "balance_loss_clip": 1.03020108, "balance_loss_mlp": 1.04076898, "epoch": 0.05801893882459041, "flos": 18076699365120.0, "grad_norm": 2.6028477809575405, "language_loss": 0.77876091, "learning_rate": 3.96699576067241e-06, "loss": 0.80095172, "num_input_tokens_seen": 20572155, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.171875, "step": 965, "time_per_iteration": 3.8165664672851562 }, { "auxiliary_loss_clip": 0.0115381, "auxiliary_loss_mlp": 0.01051914, "balance_loss_clip": 1.02453136, "balance_loss_mlp": 1.03798401, "epoch": 0.05807906207725838, "flos": 17747922291840.0, "grad_norm": 2.042103237634218, "language_loss": 0.81013924, "learning_rate": 3.966927370091442e-06, "loss": 0.83219647, "num_input_tokens_seen": 20590395, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.15625, "step": 966, "time_per_iteration": 2.4498939514160156 }, { "auxiliary_loss_clip": 0.01156764, "auxiliary_loss_mlp": 0.01054726, "balance_loss_clip": 1.0254128, "balance_loss_mlp": 1.03877592, "epoch": 0.058139185329926346, "flos": 18039412166400.0, "grad_norm": 1.9158014889501103, "language_loss": 0.76398164, "learning_rate": 3.9668589093158975e-06, "loss": 0.78609657, "num_input_tokens_seen": 20608435, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.1796875, "step": 967, "time_per_iteration": 3.8259549140930176 }, { "auxiliary_loss_clip": 0.01040902, "auxiliary_loss_mlp": 0.01004826, "balance_loss_clip": 1.00084472, "balance_loss_mlp": 1.00524974, "epoch": 0.05819930858259432, "flos": 62360287758720.0, "grad_norm": 1.1378855262618766, "language_loss": 0.57294559, "learning_rate": 3.966790378348217e-06, "loss": 0.59340286, "num_input_tokens_seen": 20668575, "router_z_loss_clip": 0.03979492, "router_z_loss_mlp": 0.35546875, "step": 968, "time_per_iteration": 2.9690871238708496 }, { "auxiliary_loss_clip": 0.01163325, "auxiliary_loss_mlp": 0.01056222, "balance_loss_clip": 1.0261575, "balance_loss_mlp": 1.04513001, "epoch": 0.05825943183526229, "flos": 19134358225920.0, "grad_norm": 1.995117496071628, "language_loss": 0.82372129, "learning_rate": 3.966721777190847e-06, "loss": 0.84591675, "num_input_tokens_seen": 20687355, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.1796875, "step": 969, "time_per_iteration": 2.413395643234253 }, { "auxiliary_loss_clip": 0.01158774, "auxiliary_loss_mlp": 0.0105649, "balance_loss_clip": 1.02590108, "balance_loss_mlp": 1.03951955, "epoch": 0.058319555087930255, "flos": 29021202547200.0, "grad_norm": 2.6883084318848938, "language_loss": 0.78030252, "learning_rate": 3.966653105846237e-06, "loss": 0.80245519, "num_input_tokens_seen": 20705710, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.1953125, "step": 970, "time_per_iteration": 2.459301710128784 }, { "auxiliary_loss_clip": 0.0116011, "auxiliary_loss_mlp": 0.01059005, "balance_loss_clip": 1.02681899, "balance_loss_mlp": 1.03973424, "epoch": 0.05837967834059823, "flos": 18879003475200.0, "grad_norm": 2.4646959244174207, "language_loss": 0.92010236, "learning_rate": 3.966584364316835e-06, "loss": 0.94229347, "num_input_tokens_seen": 20722405, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.203125, "step": 971, "time_per_iteration": 2.41672945022583 }, { "auxiliary_loss_clip": 0.01153373, "auxiliary_loss_mlp": 0.01046757, "balance_loss_clip": 1.01958954, "balance_loss_mlp": 1.03688502, "epoch": 0.05843980159326619, "flos": 25701870078720.0, "grad_norm": 1.82764836766903, "language_loss": 0.85977405, "learning_rate": 3.966515552605096e-06, "loss": 0.88177538, "num_input_tokens_seen": 20741480, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.1640625, "step": 972, "time_per_iteration": 2.4400033950805664 }, { "auxiliary_loss_clip": 0.01156008, "auxiliary_loss_mlp": 0.01055734, "balance_loss_clip": 1.02808905, "balance_loss_mlp": 1.03966117, "epoch": 0.058499924845934165, "flos": 25551080449920.0, "grad_norm": 2.605468634185153, "language_loss": 0.87558317, "learning_rate": 3.966446670713476e-06, "loss": 0.89770055, "num_input_tokens_seen": 20759685, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.1640625, "step": 973, "time_per_iteration": 2.4687106609344482 }, { "auxiliary_loss_clip": 0.01156814, "auxiliary_loss_mlp": 0.01056868, "balance_loss_clip": 1.02561092, "balance_loss_mlp": 1.03824723, "epoch": 0.05856004809860214, "flos": 16435222882560.0, "grad_norm": 2.345147396479656, "language_loss": 0.74578172, "learning_rate": 3.9663777186444325e-06, "loss": 0.76791859, "num_input_tokens_seen": 20778180, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.1875, "step": 974, "time_per_iteration": 2.4109954833984375 }, { "auxiliary_loss_clip": 0.01153303, "auxiliary_loss_mlp": 0.01056384, "balance_loss_clip": 1.02710593, "balance_loss_mlp": 1.03830385, "epoch": 0.0586201713512701, "flos": 39457230554880.0, "grad_norm": 2.0401418582323214, "language_loss": 0.76616645, "learning_rate": 3.966308696400426e-06, "loss": 0.78826332, "num_input_tokens_seen": 20802705, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.1484375, "step": 975, "time_per_iteration": 2.5860371589660645 }, { "auxiliary_loss_clip": 0.01156982, "auxiliary_loss_mlp": 0.01055984, "balance_loss_clip": 1.02589536, "balance_loss_mlp": 1.03819263, "epoch": 0.058680294603938074, "flos": 23364120885120.0, "grad_norm": 2.3569782231287117, "language_loss": 0.76396739, "learning_rate": 3.96623960398392e-06, "loss": 0.78609711, "num_input_tokens_seen": 20822540, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.1875, "step": 976, "time_per_iteration": 2.426626443862915 }, { "auxiliary_loss_clip": 0.01154381, "auxiliary_loss_mlp": 0.01055215, "balance_loss_clip": 1.02517378, "balance_loss_mlp": 1.0368669, "epoch": 0.05874041785660604, "flos": 32230698278400.0, "grad_norm": 1.9095241002161987, "language_loss": 0.8741101, "learning_rate": 3.9661704413973805e-06, "loss": 0.89620602, "num_input_tokens_seen": 20844175, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.171875, "step": 977, "time_per_iteration": 2.501909017562866 }, { "auxiliary_loss_clip": 0.01155156, "auxiliary_loss_mlp": 0.01054398, "balance_loss_clip": 1.02616882, "balance_loss_mlp": 1.03929973, "epoch": 0.05880054110927401, "flos": 22308940730880.0, "grad_norm": 1.84656154429845, "language_loss": 0.79312801, "learning_rate": 3.966101208643276e-06, "loss": 0.81522357, "num_input_tokens_seen": 20864730, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.15625, "step": 978, "time_per_iteration": 2.425081729888916 }, { "auxiliary_loss_clip": 0.01160624, "auxiliary_loss_mlp": 0.01057484, "balance_loss_clip": 1.02701402, "balance_loss_mlp": 1.03972781, "epoch": 0.05886066436194198, "flos": 27379237305600.0, "grad_norm": 2.6661522830253337, "language_loss": 0.80714297, "learning_rate": 3.966031905724076e-06, "loss": 0.82932401, "num_input_tokens_seen": 20885200, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.203125, "step": 979, "time_per_iteration": 2.495361328125 }, { "auxiliary_loss_clip": 0.01042219, "auxiliary_loss_mlp": 0.01005382, "balance_loss_clip": 1.00101852, "balance_loss_mlp": 1.00593567, "epoch": 0.05892078761460995, "flos": 59581725338880.0, "grad_norm": 0.9253505532266535, "language_loss": 0.59051669, "learning_rate": 3.965962532642255e-06, "loss": 0.61099267, "num_input_tokens_seen": 20940325, "router_z_loss_clip": 0.04370117, "router_z_loss_mlp": 0.36328125, "step": 980, "time_per_iteration": 2.9525704383850098 }, { "auxiliary_loss_clip": 0.01152762, "auxiliary_loss_mlp": 0.01056813, "balance_loss_clip": 1.02776158, "balance_loss_mlp": 1.0376246, "epoch": 0.05898091086727792, "flos": 15413175475200.0, "grad_norm": 2.060047658786371, "language_loss": 0.86316341, "learning_rate": 3.9658930894002885e-06, "loss": 0.88525915, "num_input_tokens_seen": 20958220, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.1484375, "step": 981, "time_per_iteration": 2.410400152206421 }, { "auxiliary_loss_clip": 0.01156705, "auxiliary_loss_mlp": 0.01054771, "balance_loss_clip": 1.02780557, "balance_loss_mlp": 1.04009056, "epoch": 0.059041034119945886, "flos": 23654319039360.0, "grad_norm": 2.1100833836001347, "language_loss": 0.79749936, "learning_rate": 3.965823576000653e-06, "loss": 0.81961417, "num_input_tokens_seen": 20978920, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.171875, "step": 982, "time_per_iteration": 2.4408376216888428 }, { "auxiliary_loss_clip": 0.01163271, "auxiliary_loss_mlp": 0.01055369, "balance_loss_clip": 1.02616262, "balance_loss_mlp": 1.04094839, "epoch": 0.05910115737261386, "flos": 24752930791680.0, "grad_norm": 2.1948144071786544, "language_loss": 0.84188688, "learning_rate": 3.965753992445833e-06, "loss": 0.86407328, "num_input_tokens_seen": 20999490, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.2265625, "step": 983, "time_per_iteration": 2.453981637954712 }, { "auxiliary_loss_clip": 0.01157224, "auxiliary_loss_mlp": 0.01063252, "balance_loss_clip": 1.03204322, "balance_loss_mlp": 1.04023576, "epoch": 0.05916128062528183, "flos": 11727953291520.0, "grad_norm": 1.9029363631781626, "language_loss": 0.84873164, "learning_rate": 3.9656843387383075e-06, "loss": 0.87093639, "num_input_tokens_seen": 21017865, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.171875, "step": 984, "time_per_iteration": 2.387319564819336 }, { "auxiliary_loss_clip": 0.01152499, "auxiliary_loss_mlp": 0.01055111, "balance_loss_clip": 1.02781165, "balance_loss_mlp": 1.04044604, "epoch": 0.059221403877949795, "flos": 21902063212800.0, "grad_norm": 2.5293821744566185, "language_loss": 0.77352715, "learning_rate": 3.965614614880566e-06, "loss": 0.79560328, "num_input_tokens_seen": 21035900, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.125, "step": 985, "time_per_iteration": 2.4488773345947266 }, { "auxiliary_loss_clip": 0.01159218, "auxiliary_loss_mlp": 0.01057503, "balance_loss_clip": 1.02842832, "balance_loss_mlp": 1.04079485, "epoch": 0.05928152713061777, "flos": 20513742065280.0, "grad_norm": 2.9893172371468024, "language_loss": 0.90492582, "learning_rate": 3.965544820875094e-06, "loss": 0.92709303, "num_input_tokens_seen": 21053235, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.1875, "step": 986, "time_per_iteration": 2.443754196166992 }, { "auxiliary_loss_clip": 0.0115854, "auxiliary_loss_mlp": 0.0106082, "balance_loss_clip": 1.02829957, "balance_loss_mlp": 1.03743863, "epoch": 0.05934165038328574, "flos": 24494084904960.0, "grad_norm": 1.9787948727060412, "language_loss": 0.75887883, "learning_rate": 3.965474956724383e-06, "loss": 0.78107238, "num_input_tokens_seen": 21073090, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.2109375, "step": 987, "time_per_iteration": 2.464308977127075 }, { "auxiliary_loss_clip": 0.01158062, "auxiliary_loss_mlp": 0.0105224, "balance_loss_clip": 1.02315331, "balance_loss_mlp": 1.0372479, "epoch": 0.059401773635953704, "flos": 38726498465280.0, "grad_norm": 2.1718172609657764, "language_loss": 0.71649158, "learning_rate": 3.965405022430928e-06, "loss": 0.73859465, "num_input_tokens_seen": 21094895, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.2109375, "step": 988, "time_per_iteration": 2.544602870941162 }, { "auxiliary_loss_clip": 0.01040901, "auxiliary_loss_mlp": 0.01004285, "balance_loss_clip": 0.9999463, "balance_loss_mlp": 1.00449657, "epoch": 0.059461896888621676, "flos": 58020618539520.0, "grad_norm": 0.9235199678427388, "language_loss": 0.71133971, "learning_rate": 3.965335017997222e-06, "loss": 0.73179162, "num_input_tokens_seen": 21147555, "router_z_loss_clip": 0.04345703, "router_z_loss_mlp": 0.36328125, "step": 989, "time_per_iteration": 2.8911666870117188 }, { "auxiliary_loss_clip": 0.01161321, "auxiliary_loss_mlp": 0.01062306, "balance_loss_clip": 1.02914214, "balance_loss_mlp": 1.03849137, "epoch": 0.05952202014128964, "flos": 22126659189120.0, "grad_norm": 1.9788870334050774, "language_loss": 0.77683198, "learning_rate": 3.965264943425766e-06, "loss": 0.79906827, "num_input_tokens_seen": 21167845, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.2265625, "step": 990, "time_per_iteration": 2.424315929412842 }, { "auxiliary_loss_clip": 0.01153705, "auxiliary_loss_mlp": 0.01049102, "balance_loss_clip": 1.01887035, "balance_loss_mlp": 1.0374701, "epoch": 0.059582143393957614, "flos": 20444823308160.0, "grad_norm": 2.493270788674315, "language_loss": 0.85957623, "learning_rate": 3.965194798719059e-06, "loss": 0.88160431, "num_input_tokens_seen": 21185085, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.1640625, "step": 991, "time_per_iteration": 2.4135589599609375 }, { "auxiliary_loss_clip": 0.01158141, "auxiliary_loss_mlp": 0.01059113, "balance_loss_clip": 1.02890551, "balance_loss_mlp": 1.03746819, "epoch": 0.059642266646625586, "flos": 20593832457600.0, "grad_norm": 2.002948820668704, "language_loss": 0.76866162, "learning_rate": 3.965124583879604e-06, "loss": 0.79083419, "num_input_tokens_seen": 21204230, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.2109375, "step": 992, "time_per_iteration": 2.4235281944274902 }, { "auxiliary_loss_clip": 0.01162354, "auxiliary_loss_mlp": 0.01059023, "balance_loss_clip": 1.03100932, "balance_loss_mlp": 1.04212487, "epoch": 0.05970238989929355, "flos": 19351692639360.0, "grad_norm": 2.425489978258854, "language_loss": 0.74587756, "learning_rate": 3.965054298909908e-06, "loss": 0.76809132, "num_input_tokens_seen": 21222655, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.203125, "step": 993, "time_per_iteration": 2.4192397594451904 }, { "auxiliary_loss_clip": 0.01157142, "auxiliary_loss_mlp": 0.01057147, "balance_loss_clip": 1.02805996, "balance_loss_mlp": 1.04095399, "epoch": 0.05976251315196152, "flos": 30262713960960.0, "grad_norm": 3.039263163999806, "language_loss": 0.79152131, "learning_rate": 3.964983943812479e-06, "loss": 0.8136642, "num_input_tokens_seen": 21242310, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.15625, "step": 994, "time_per_iteration": 2.4724812507629395 }, { "auxiliary_loss_clip": 0.01154745, "auxiliary_loss_mlp": 0.0106191, "balance_loss_clip": 1.032179, "balance_loss_mlp": 1.03938246, "epoch": 0.05982263640462949, "flos": 23184038759040.0, "grad_norm": 2.871359616757743, "language_loss": 0.8020556, "learning_rate": 3.964913518589827e-06, "loss": 0.82422209, "num_input_tokens_seen": 21261410, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.15625, "step": 995, "time_per_iteration": 2.4379522800445557 }, { "auxiliary_loss_clip": 0.01155303, "auxiliary_loss_mlp": 0.01063218, "balance_loss_clip": 1.03384519, "balance_loss_mlp": 1.03777981, "epoch": 0.05988275965729746, "flos": 27849761965440.0, "grad_norm": 2.2098995933085894, "language_loss": 0.8701334, "learning_rate": 3.964843023244466e-06, "loss": 0.89231861, "num_input_tokens_seen": 21280080, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.171875, "step": 996, "time_per_iteration": 2.4437999725341797 }, { "auxiliary_loss_clip": 0.01159118, "auxiliary_loss_mlp": 0.01064769, "balance_loss_clip": 1.03220057, "balance_loss_mlp": 1.04127562, "epoch": 0.05994288290996543, "flos": 24678880064640.0, "grad_norm": 3.8608888713812597, "language_loss": 0.88007629, "learning_rate": 3.964772457778912e-06, "loss": 0.90231526, "num_input_tokens_seen": 21296765, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.1796875, "step": 997, "time_per_iteration": 2.443021297454834 }, { "auxiliary_loss_clip": 0.01038178, "auxiliary_loss_mlp": 0.01003311, "balance_loss_clip": 0.99923432, "balance_loss_mlp": 1.003317, "epoch": 0.0600030061626334, "flos": 69925965782400.0, "grad_norm": 1.0099586250694919, "language_loss": 0.75391841, "learning_rate": 3.964701822195683e-06, "loss": 0.7743333, "num_input_tokens_seen": 21363345, "router_z_loss_clip": 0.04077148, "router_z_loss_mlp": 0.34765625, "step": 998, "time_per_iteration": 3.1126739978790283 }, { "auxiliary_loss_clip": 0.01157325, "auxiliary_loss_mlp": 0.01061226, "balance_loss_clip": 1.03156662, "balance_loss_mlp": 1.0397613, "epoch": 0.06006312941530137, "flos": 26538982680960.0, "grad_norm": 2.049004464099992, "language_loss": 0.75884998, "learning_rate": 3.9646311164973e-06, "loss": 0.78103548, "num_input_tokens_seen": 21385290, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.171875, "step": 999, "time_per_iteration": 2.4973855018615723 }, { "auxiliary_loss_clip": 0.01157772, "auxiliary_loss_mlp": 0.01056214, "balance_loss_clip": 1.02531469, "balance_loss_mlp": 1.03844833, "epoch": 0.060123252667969335, "flos": 27342787979520.0, "grad_norm": 1.7610372518845114, "language_loss": 0.82862902, "learning_rate": 3.9645603406862846e-06, "loss": 0.85076886, "num_input_tokens_seen": 21407625, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.1953125, "step": 1000, "time_per_iteration": 2.4534127712249756 }, { "auxiliary_loss_clip": 0.01156995, "auxiliary_loss_mlp": 0.01057862, "balance_loss_clip": 1.02845311, "balance_loss_mlp": 1.03943276, "epoch": 0.06018337592063731, "flos": 27015477183360.0, "grad_norm": 4.36013563541894, "language_loss": 0.85889578, "learning_rate": 3.964489494765166e-06, "loss": 0.88104439, "num_input_tokens_seen": 21426835, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.171875, "step": 1001, "time_per_iteration": 3.905280113220215 }, { "auxiliary_loss_clip": 0.01154827, "auxiliary_loss_mlp": 0.01052253, "balance_loss_clip": 1.02483511, "balance_loss_mlp": 1.04005575, "epoch": 0.06024349917330528, "flos": 25591788961920.0, "grad_norm": 2.18419448899442, "language_loss": 0.74045211, "learning_rate": 3.96441857873647e-06, "loss": 0.76252288, "num_input_tokens_seen": 21444920, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.1484375, "step": 1002, "time_per_iteration": 2.4591424465179443 }, { "auxiliary_loss_clip": 0.01155412, "auxiliary_loss_mlp": 0.01053095, "balance_loss_clip": 1.02345991, "balance_loss_mlp": 1.03853083, "epoch": 0.060303622425973244, "flos": 26132279719680.0, "grad_norm": 2.3045699377531497, "language_loss": 0.75484115, "learning_rate": 3.964347592602728e-06, "loss": 0.77692622, "num_input_tokens_seen": 21463555, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.171875, "step": 1003, "time_per_iteration": 2.4360148906707764 }, { "auxiliary_loss_clip": 0.01162091, "auxiliary_loss_mlp": 0.01052213, "balance_loss_clip": 1.0206461, "balance_loss_mlp": 1.04050243, "epoch": 0.060363745678641216, "flos": 20376114019200.0, "grad_norm": 2.4424858968040235, "language_loss": 0.69722307, "learning_rate": 3.964276536366473e-06, "loss": 0.71936619, "num_input_tokens_seen": 21481990, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.21875, "step": 1004, "time_per_iteration": 5.30731987953186 }, { "auxiliary_loss_clip": 0.01162351, "auxiliary_loss_mlp": 0.01062383, "balance_loss_clip": 1.03300953, "balance_loss_mlp": 1.04186904, "epoch": 0.06042386893130918, "flos": 17748201582720.0, "grad_norm": 2.2180426128467743, "language_loss": 0.83568144, "learning_rate": 3.964205410030241e-06, "loss": 0.85792875, "num_input_tokens_seen": 21500385, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.203125, "step": 1005, "time_per_iteration": 2.4257004261016846 }, { "auxiliary_loss_clip": 0.01037841, "auxiliary_loss_mlp": 0.01008964, "balance_loss_clip": 1.00495863, "balance_loss_mlp": 1.00299001, "epoch": 0.06048399218397715, "flos": 68535061194240.0, "grad_norm": 0.9146307501256149, "language_loss": 0.59042352, "learning_rate": 3.964134213596571e-06, "loss": 0.61089152, "num_input_tokens_seen": 21561040, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.34765625, "step": 1006, "time_per_iteration": 3.080986261367798 }, { "auxiliary_loss_clip": 0.0115253, "auxiliary_loss_mlp": 0.01054203, "balance_loss_clip": 1.02444792, "balance_loss_mlp": 1.03542399, "epoch": 0.060544115436645125, "flos": 23257391258880.0, "grad_norm": 4.936143666476675, "language_loss": 0.74330884, "learning_rate": 3.964062947068003e-06, "loss": 0.76537621, "num_input_tokens_seen": 21580655, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.171875, "step": 1007, "time_per_iteration": 3.882143497467041 }, { "auxiliary_loss_clip": 0.01155834, "auxiliary_loss_mlp": 0.01052496, "balance_loss_clip": 1.02348018, "balance_loss_mlp": 1.03754771, "epoch": 0.06060423868931309, "flos": 23877309093120.0, "grad_norm": 1.740738530070099, "language_loss": 0.80621183, "learning_rate": 3.9639916104470804e-06, "loss": 0.82829511, "num_input_tokens_seen": 21599650, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.1875, "step": 1008, "time_per_iteration": 2.4477415084838867 }, { "auxiliary_loss_clip": 0.01158471, "auxiliary_loss_mlp": 0.01053834, "balance_loss_clip": 1.0252471, "balance_loss_mlp": 1.04050589, "epoch": 0.06066436194198106, "flos": 18727236328320.0, "grad_norm": 1.7879989603434563, "language_loss": 0.77816951, "learning_rate": 3.9639202037363494e-06, "loss": 0.80029255, "num_input_tokens_seen": 21617550, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.1796875, "step": 1009, "time_per_iteration": 2.418851613998413 }, { "auxiliary_loss_clip": 0.01152606, "auxiliary_loss_mlp": 0.01048527, "balance_loss_clip": 1.02015519, "balance_loss_mlp": 1.03950274, "epoch": 0.06072448519464903, "flos": 24639428361600.0, "grad_norm": 1.7972262110622221, "language_loss": 0.92497772, "learning_rate": 3.9638487269383575e-06, "loss": 0.94698906, "num_input_tokens_seen": 21635865, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.1328125, "step": 1010, "time_per_iteration": 2.451969623565674 }, { "auxiliary_loss_clip": 0.01159785, "auxiliary_loss_mlp": 0.01057949, "balance_loss_clip": 1.0270505, "balance_loss_mlp": 1.03740323, "epoch": 0.060784608447317, "flos": 17378017770240.0, "grad_norm": 3.6712956960038796, "language_loss": 0.71411031, "learning_rate": 3.9637771800556576e-06, "loss": 0.73628759, "num_input_tokens_seen": 21653945, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.2265625, "step": 1011, "time_per_iteration": 2.4284327030181885 }, { "auxiliary_loss_clip": 0.01158641, "auxiliary_loss_mlp": 0.01067488, "balance_loss_clip": 1.03451467, "balance_loss_mlp": 1.03772664, "epoch": 0.06084473169998497, "flos": 23691187301760.0, "grad_norm": 2.2199777315452334, "language_loss": 0.8743695, "learning_rate": 3.963705563090801e-06, "loss": 0.89663088, "num_input_tokens_seen": 21671230, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.2109375, "step": 1012, "time_per_iteration": 2.440599203109741 }, { "auxiliary_loss_clip": 0.01152822, "auxiliary_loss_mlp": 0.01049775, "balance_loss_clip": 1.02074802, "balance_loss_mlp": 1.03609371, "epoch": 0.06090485495265294, "flos": 23545320174720.0, "grad_norm": 2.3187281216043703, "language_loss": 0.76561666, "learning_rate": 3.963633876046344e-06, "loss": 0.78764266, "num_input_tokens_seen": 21691155, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.171875, "step": 1013, "time_per_iteration": 2.475726366043091 }, { "auxiliary_loss_clip": 0.01157383, "auxiliary_loss_mlp": 0.01060122, "balance_loss_clip": 1.02834129, "balance_loss_mlp": 1.03816497, "epoch": 0.06096497820532091, "flos": 20338268238720.0, "grad_norm": 2.384126716604103, "language_loss": 0.85482019, "learning_rate": 3.963562118924844e-06, "loss": 0.87699521, "num_input_tokens_seen": 21707405, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.1953125, "step": 1014, "time_per_iteration": 2.417125701904297 }, { "auxiliary_loss_clip": 0.01160361, "auxiliary_loss_mlp": 0.01056803, "balance_loss_clip": 1.02313781, "balance_loss_mlp": 1.03910649, "epoch": 0.061025101457988874, "flos": 26937935320320.0, "grad_norm": 2.246085630291627, "language_loss": 0.73297465, "learning_rate": 3.963490291728864e-06, "loss": 0.75514627, "num_input_tokens_seen": 21728090, "router_z_loss_clip": 0.3359375, "router_z_loss_mlp": 1.2109375, "step": 1015, "time_per_iteration": 2.483424186706543 }, { "auxiliary_loss_clip": 0.0115166, "auxiliary_loss_mlp": 0.0105429, "balance_loss_clip": 1.02441573, "balance_loss_mlp": 1.03619528, "epoch": 0.061085224710656846, "flos": 25373861055360.0, "grad_norm": 1.6809549189182948, "language_loss": 0.7901845, "learning_rate": 3.963418394460966e-06, "loss": 0.812244, "num_input_tokens_seen": 21747950, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.15625, "step": 1016, "time_per_iteration": 2.439598798751831 }, { "auxiliary_loss_clip": 0.01157869, "auxiliary_loss_mlp": 0.01047841, "balance_loss_clip": 1.01918364, "balance_loss_mlp": 1.03926146, "epoch": 0.06114534796332482, "flos": 24823664939520.0, "grad_norm": 1.7343598991326854, "language_loss": 0.75973874, "learning_rate": 3.9633464271237166e-06, "loss": 0.78179586, "num_input_tokens_seen": 21767900, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.1875, "step": 1017, "time_per_iteration": 2.4395880699157715 }, { "auxiliary_loss_clip": 0.0116119, "auxiliary_loss_mlp": 0.01061209, "balance_loss_clip": 1.03021479, "balance_loss_mlp": 1.04041672, "epoch": 0.061205471215992784, "flos": 20630386517760.0, "grad_norm": 2.478866834537974, "language_loss": 0.85801131, "learning_rate": 3.963274389719682e-06, "loss": 0.88023531, "num_input_tokens_seen": 21787375, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.203125, "step": 1018, "time_per_iteration": 2.4153647422790527 }, { "auxiliary_loss_clip": 0.01155674, "auxiliary_loss_mlp": 0.01058413, "balance_loss_clip": 1.02789545, "balance_loss_mlp": 1.03959513, "epoch": 0.061265594468660756, "flos": 16507423307520.0, "grad_norm": 7.811235902239468, "language_loss": 0.76732063, "learning_rate": 3.963202282251436e-06, "loss": 0.78946149, "num_input_tokens_seen": 21806275, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.15625, "step": 1019, "time_per_iteration": 2.393094062805176 }, { "auxiliary_loss_clip": 0.01156278, "auxiliary_loss_mlp": 0.01053328, "balance_loss_clip": 1.02164245, "balance_loss_mlp": 1.03888106, "epoch": 0.06132571772132872, "flos": 26245118833920.0, "grad_norm": 2.2044878885481083, "language_loss": 0.84023499, "learning_rate": 3.96313010472155e-06, "loss": 0.86233103, "num_input_tokens_seen": 21826430, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.171875, "step": 1020, "time_per_iteration": 2.4407336711883545 }, { "auxiliary_loss_clip": 0.01159264, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.02580905, "balance_loss_mlp": 1.04091072, "epoch": 0.06138584097399669, "flos": 37413275385600.0, "grad_norm": 2.1251693510396987, "language_loss": 0.79392493, "learning_rate": 3.963057857132601e-06, "loss": 0.8160727, "num_input_tokens_seen": 21847800, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.1875, "step": 1021, "time_per_iteration": 2.55255126953125 }, { "auxiliary_loss_clip": 0.01154293, "auxiliary_loss_mlp": 0.01057299, "balance_loss_clip": 1.03082263, "balance_loss_mlp": 1.03812289, "epoch": 0.061445964226664665, "flos": 17419703800320.0, "grad_norm": 1.8709309273080235, "language_loss": 0.87560797, "learning_rate": 3.962985539487165e-06, "loss": 0.89772391, "num_input_tokens_seen": 21863385, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.15625, "step": 1022, "time_per_iteration": 2.377963066101074 }, { "auxiliary_loss_clip": 0.01157659, "auxiliary_loss_mlp": 0.01053407, "balance_loss_clip": 1.02510691, "balance_loss_mlp": 1.04006875, "epoch": 0.06150608747933263, "flos": 22598964328320.0, "grad_norm": 3.5561262995454856, "language_loss": 0.82924676, "learning_rate": 3.962913151787826e-06, "loss": 0.85135746, "num_input_tokens_seen": 21881880, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.171875, "step": 1023, "time_per_iteration": 2.4481008052825928 }, { "auxiliary_loss_clip": 0.01039811, "auxiliary_loss_mlp": 0.01006344, "balance_loss_clip": 1.00257671, "balance_loss_mlp": 1.0057925, "epoch": 0.0615662107320006, "flos": 56738712816000.0, "grad_norm": 0.892435407282451, "language_loss": 0.65076607, "learning_rate": 3.962840694037165e-06, "loss": 0.67122757, "num_input_tokens_seen": 21940550, "router_z_loss_clip": 0.03759766, "router_z_loss_mlp": 0.33984375, "step": 1024, "time_per_iteration": 3.0637269020080566 }, { "auxiliary_loss_clip": 0.011566, "auxiliary_loss_mlp": 0.01058846, "balance_loss_clip": 1.027637, "balance_loss_mlp": 1.03874087, "epoch": 0.06162633398466857, "flos": 22563701988480.0, "grad_norm": 2.121991078512927, "language_loss": 0.88018882, "learning_rate": 3.962768166237768e-06, "loss": 0.90234327, "num_input_tokens_seen": 21958390, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.1796875, "step": 1025, "time_per_iteration": 2.4291317462921143 }, { "auxiliary_loss_clip": 0.01158191, "auxiliary_loss_mlp": 0.0104929, "balance_loss_clip": 1.02213371, "balance_loss_mlp": 1.04083061, "epoch": 0.06168645723733654, "flos": 25591928607360.0, "grad_norm": 1.944980768345882, "language_loss": 0.84539229, "learning_rate": 3.9626955683922264e-06, "loss": 0.86746705, "num_input_tokens_seen": 21978625, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.171875, "step": 1026, "time_per_iteration": 2.4521775245666504 }, { "auxiliary_loss_clip": 0.01161698, "auxiliary_loss_mlp": 0.01052541, "balance_loss_clip": 1.02338207, "balance_loss_mlp": 1.04109931, "epoch": 0.06174658049000451, "flos": 15996993096960.0, "grad_norm": 2.258709835405207, "language_loss": 0.82325631, "learning_rate": 3.962622900503127e-06, "loss": 0.84539866, "num_input_tokens_seen": 21996035, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.203125, "step": 1027, "time_per_iteration": 2.4000582695007324 }, { "auxiliary_loss_clip": 0.01152861, "auxiliary_loss_mlp": 0.01051801, "balance_loss_clip": 1.02383435, "balance_loss_mlp": 1.0380646, "epoch": 0.06180670374267248, "flos": 11285324674560.0, "grad_norm": 2.537469324710815, "language_loss": 0.84134269, "learning_rate": 3.962550162573065e-06, "loss": 0.86338931, "num_input_tokens_seen": 22011625, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.1484375, "step": 1028, "time_per_iteration": 2.358281373977661 }, { "auxiliary_loss_clip": 0.01037697, "auxiliary_loss_mlp": 0.01007238, "balance_loss_clip": 1.00339997, "balance_loss_mlp": 1.00318575, "epoch": 0.06186682699534045, "flos": 65127224695680.0, "grad_norm": 0.9629596283558131, "language_loss": 0.60529995, "learning_rate": 3.962477354604636e-06, "loss": 0.62574935, "num_input_tokens_seen": 22066035, "router_z_loss_clip": 0.03833008, "router_z_loss_mlp": 0.34375, "step": 1029, "time_per_iteration": 2.864759683609009 }, { "auxiliary_loss_clip": 0.01150987, "auxiliary_loss_mlp": 0.01053206, "balance_loss_clip": 1.02463198, "balance_loss_mlp": 1.03651297, "epoch": 0.061926950248008414, "flos": 21104681604480.0, "grad_norm": 4.849598134920486, "language_loss": 0.82339936, "learning_rate": 3.962404476600438e-06, "loss": 0.84544134, "num_input_tokens_seen": 22085015, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.140625, "step": 1030, "time_per_iteration": 2.394225597381592 }, { "auxiliary_loss_clip": 0.01161727, "auxiliary_loss_mlp": 0.01062051, "balance_loss_clip": 1.03181958, "balance_loss_mlp": 1.0407145, "epoch": 0.061987073500676386, "flos": 17747503355520.0, "grad_norm": 2.727803687845987, "language_loss": 0.7986154, "learning_rate": 3.962331528563072e-06, "loss": 0.82085317, "num_input_tokens_seen": 22102775, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.2109375, "step": 1031, "time_per_iteration": 2.4070053100585938 }, { "auxiliary_loss_clip": 0.01156936, "auxiliary_loss_mlp": 0.01061886, "balance_loss_clip": 1.03142774, "balance_loss_mlp": 1.03971457, "epoch": 0.06204719675334436, "flos": 21835134403200.0, "grad_norm": 1.6632160897947257, "language_loss": 0.77500129, "learning_rate": 3.962258510495142e-06, "loss": 0.79718953, "num_input_tokens_seen": 22121680, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.171875, "step": 1032, "time_per_iteration": 2.404686450958252 }, { "auxiliary_loss_clip": 0.01158514, "auxiliary_loss_mlp": 0.01062509, "balance_loss_clip": 1.0319314, "balance_loss_mlp": 1.03803051, "epoch": 0.06210732000601232, "flos": 19352705068800.0, "grad_norm": 2.252385851128305, "language_loss": 0.88763595, "learning_rate": 3.962185422399254e-06, "loss": 0.90984619, "num_input_tokens_seen": 22138155, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.203125, "step": 1033, "time_per_iteration": 2.3899827003479004 }, { "auxiliary_loss_clip": 0.01156533, "auxiliary_loss_mlp": 0.01060124, "balance_loss_clip": 1.03165722, "balance_loss_mlp": 1.03904891, "epoch": 0.062167443258680295, "flos": 24748357403520.0, "grad_norm": 2.1112015481619135, "language_loss": 0.85067034, "learning_rate": 3.962112264278014e-06, "loss": 0.87283695, "num_input_tokens_seen": 22157420, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.171875, "step": 1034, "time_per_iteration": 2.417952299118042 }, { "auxiliary_loss_clip": 0.01152025, "auxiliary_loss_mlp": 0.0105143, "balance_loss_clip": 1.02109075, "balance_loss_mlp": 1.04026592, "epoch": 0.06222756651134827, "flos": 34457074634880.0, "grad_norm": 2.0175474623906156, "language_loss": 0.80539238, "learning_rate": 3.962039036134035e-06, "loss": 0.82742691, "num_input_tokens_seen": 22178620, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.1171875, "step": 1035, "time_per_iteration": 2.5118720531463623 }, { "auxiliary_loss_clip": 0.01158328, "auxiliary_loss_mlp": 0.01052676, "balance_loss_clip": 1.02084732, "balance_loss_mlp": 1.0402739, "epoch": 0.06228768976401623, "flos": 25665281107200.0, "grad_norm": 2.7436331301329893, "language_loss": 0.78723359, "learning_rate": 3.961965737969931e-06, "loss": 0.8093437, "num_input_tokens_seen": 22197125, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.1796875, "step": 1036, "time_per_iteration": 2.429211139678955 }, { "auxiliary_loss_clip": 0.01154011, "auxiliary_loss_mlp": 0.01055222, "balance_loss_clip": 1.02690959, "balance_loss_mlp": 1.03979087, "epoch": 0.062347813016684205, "flos": 25294608535680.0, "grad_norm": 1.8662455074359048, "language_loss": 0.86611468, "learning_rate": 3.961892369788315e-06, "loss": 0.88820702, "num_input_tokens_seen": 22217575, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.140625, "step": 1037, "time_per_iteration": 2.4833195209503174 }, { "auxiliary_loss_clip": 0.01152175, "auxiliary_loss_mlp": 0.01053944, "balance_loss_clip": 1.02109003, "balance_loss_mlp": 1.03708446, "epoch": 0.06240793626935217, "flos": 26905815002880.0, "grad_norm": 2.290920851884523, "language_loss": 0.80359685, "learning_rate": 3.961818931591808e-06, "loss": 0.82565802, "num_input_tokens_seen": 22236840, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.15625, "step": 1038, "time_per_iteration": 2.4463109970092773 }, { "auxiliary_loss_clip": 0.01153569, "auxiliary_loss_mlp": 0.01057785, "balance_loss_clip": 1.02872229, "balance_loss_mlp": 1.0402391, "epoch": 0.06246805952202014, "flos": 21614727790080.0, "grad_norm": 3.0631812454019824, "language_loss": 0.85687834, "learning_rate": 3.961745423383028e-06, "loss": 0.87899184, "num_input_tokens_seen": 22256465, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.1328125, "step": 1039, "time_per_iteration": 2.4449520111083984 }, { "auxiliary_loss_clip": 0.01155556, "auxiliary_loss_mlp": 0.01059921, "balance_loss_clip": 1.0290575, "balance_loss_mlp": 1.03914809, "epoch": 0.0625281827746881, "flos": 19311053950080.0, "grad_norm": 1.8935037623048254, "language_loss": 0.80690914, "learning_rate": 3.961671845164602e-06, "loss": 0.82906389, "num_input_tokens_seen": 22274025, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.1640625, "step": 1040, "time_per_iteration": 3.8361432552337646 }, { "auxiliary_loss_clip": 0.01157663, "auxiliary_loss_mlp": 0.01059855, "balance_loss_clip": 1.03005266, "balance_loss_mlp": 1.04205906, "epoch": 0.06258830602735609, "flos": 27744533527680.0, "grad_norm": 8.969843761282052, "language_loss": 0.69530857, "learning_rate": 3.961598196939153e-06, "loss": 0.71748376, "num_input_tokens_seen": 22292245, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.15625, "step": 1041, "time_per_iteration": 2.4421133995056152 }, { "auxiliary_loss_clip": 0.01153717, "auxiliary_loss_mlp": 0.01054046, "balance_loss_clip": 1.02290881, "balance_loss_mlp": 1.03592014, "epoch": 0.06264842928002405, "flos": 23221465603200.0, "grad_norm": 2.1600309028167017, "language_loss": 0.81277382, "learning_rate": 3.961524478709311e-06, "loss": 0.8348515, "num_input_tokens_seen": 22311455, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.1796875, "step": 1042, "time_per_iteration": 2.4429678916931152 }, { "auxiliary_loss_clip": 0.01155927, "auxiliary_loss_mlp": 0.01047444, "balance_loss_clip": 1.01816618, "balance_loss_mlp": 1.03883386, "epoch": 0.06270855253269202, "flos": 38397965771520.0, "grad_norm": 1.6556398191388253, "language_loss": 0.76052594, "learning_rate": 3.961450690477705e-06, "loss": 0.78255963, "num_input_tokens_seen": 22333750, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.171875, "step": 1043, "time_per_iteration": 3.930548906326294 }, { "auxiliary_loss_clip": 0.0115033, "auxiliary_loss_mlp": 0.01048839, "balance_loss_clip": 1.02066958, "balance_loss_mlp": 1.03881478, "epoch": 0.06276867578535998, "flos": 22452503708160.0, "grad_norm": 2.1727494522463116, "language_loss": 0.92467427, "learning_rate": 3.961376832246969e-06, "loss": 0.946666, "num_input_tokens_seen": 22351940, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.1171875, "step": 1044, "time_per_iteration": 3.851454973220825 }, { "auxiliary_loss_clip": 0.01153806, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.02408147, "balance_loss_mlp": 1.03988838, "epoch": 0.06282879903802796, "flos": 22929312412800.0, "grad_norm": 2.6175374391353987, "language_loss": 0.86091137, "learning_rate": 3.96130290401974e-06, "loss": 0.88296872, "num_input_tokens_seen": 22372085, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.140625, "step": 1045, "time_per_iteration": 2.4562478065490723 }, { "auxiliary_loss_clip": 0.01147997, "auxiliary_loss_mlp": 0.01058226, "balance_loss_clip": 1.03123653, "balance_loss_mlp": 1.0371809, "epoch": 0.06288892229069593, "flos": 34817937114240.0, "grad_norm": 2.050724739218883, "language_loss": 0.78363693, "learning_rate": 3.961228905798655e-06, "loss": 0.80569911, "num_input_tokens_seen": 22392020, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.109375, "step": 1046, "time_per_iteration": 3.8668949604034424 }, { "auxiliary_loss_clip": 0.01154558, "auxiliary_loss_mlp": 0.01061772, "balance_loss_clip": 1.03345942, "balance_loss_mlp": 1.03891706, "epoch": 0.06294904554336389, "flos": 19426127391360.0, "grad_norm": 2.827178720419603, "language_loss": 0.77426672, "learning_rate": 3.961154837586356e-06, "loss": 0.79642999, "num_input_tokens_seen": 22411180, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.15625, "step": 1047, "time_per_iteration": 2.403306007385254 }, { "auxiliary_loss_clip": 0.01158847, "auxiliary_loss_mlp": 0.0105585, "balance_loss_clip": 1.02571368, "balance_loss_mlp": 1.03932309, "epoch": 0.06300916879603187, "flos": 40660267783680.0, "grad_norm": 2.2582086957955054, "language_loss": 0.7676698, "learning_rate": 3.961080699385484e-06, "loss": 0.78981674, "num_input_tokens_seen": 22435105, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.1953125, "step": 1048, "time_per_iteration": 2.585777521133423 }, { "auxiliary_loss_clip": 0.01159709, "auxiliary_loss_mlp": 0.01053575, "balance_loss_clip": 1.02439284, "balance_loss_mlp": 1.040627, "epoch": 0.06306929204869983, "flos": 23803048897920.0, "grad_norm": 2.868288878903169, "language_loss": 0.77440327, "learning_rate": 3.961006491198688e-06, "loss": 0.79653615, "num_input_tokens_seen": 22452710, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.1953125, "step": 1049, "time_per_iteration": 2.448014259338379 }, { "auxiliary_loss_clip": 0.01154987, "auxiliary_loss_mlp": 0.0105744, "balance_loss_clip": 1.02704167, "balance_loss_mlp": 1.03871131, "epoch": 0.0631294153013678, "flos": 18914824396800.0, "grad_norm": 2.1328334411159666, "language_loss": 0.83224154, "learning_rate": 3.960932213028614e-06, "loss": 0.85436583, "num_input_tokens_seen": 22470175, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.1640625, "step": 1050, "time_per_iteration": 2.428130626678467 }, { "auxiliary_loss_clip": 0.01153997, "auxiliary_loss_mlp": 0.0105438, "balance_loss_clip": 1.02810645, "balance_loss_mlp": 1.04045296, "epoch": 0.06318953855403578, "flos": 24279019729920.0, "grad_norm": 2.0519250920100536, "language_loss": 0.76973629, "learning_rate": 3.960857864877913e-06, "loss": 0.79182005, "num_input_tokens_seen": 22490020, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.1328125, "step": 1051, "time_per_iteration": 2.432659864425659 }, { "auxiliary_loss_clip": 0.01155369, "auxiliary_loss_mlp": 0.01063899, "balance_loss_clip": 1.03567064, "balance_loss_mlp": 1.03871274, "epoch": 0.06324966180670374, "flos": 22527811244160.0, "grad_norm": 2.0842928281458883, "language_loss": 0.80101454, "learning_rate": 3.960783446749239e-06, "loss": 0.82320726, "num_input_tokens_seen": 22509685, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.171875, "step": 1052, "time_per_iteration": 2.4301910400390625 }, { "auxiliary_loss_clip": 0.01156457, "auxiliary_loss_mlp": 0.01054979, "balance_loss_clip": 1.0256772, "balance_loss_mlp": 1.03852856, "epoch": 0.06330978505937171, "flos": 15777214888320.0, "grad_norm": 2.4383696598633495, "language_loss": 0.78276086, "learning_rate": 3.960708958645247e-06, "loss": 0.8048752, "num_input_tokens_seen": 22527905, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.1796875, "step": 1053, "time_per_iteration": 2.3998453617095947 }, { "auxiliary_loss_clip": 0.01154537, "auxiliary_loss_mlp": 0.01047982, "balance_loss_clip": 1.02014709, "balance_loss_mlp": 1.03847611, "epoch": 0.06336990831203967, "flos": 21470012737920.0, "grad_norm": 1.9432053338143196, "language_loss": 0.84447843, "learning_rate": 3.960634400568597e-06, "loss": 0.86650366, "num_input_tokens_seen": 22546335, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.15625, "step": 1054, "time_per_iteration": 2.4102094173431396 }, { "auxiliary_loss_clip": 0.01153083, "auxiliary_loss_mlp": 0.01058387, "balance_loss_clip": 1.03110003, "balance_loss_mlp": 1.03914165, "epoch": 0.06343003156470765, "flos": 18477886331520.0, "grad_norm": 2.41561749478276, "language_loss": 0.85629678, "learning_rate": 3.9605597725219485e-06, "loss": 0.87841147, "num_input_tokens_seen": 22563885, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.140625, "step": 1055, "time_per_iteration": 2.3968770503997803 }, { "auxiliary_loss_clip": 0.01155734, "auxiliary_loss_mlp": 0.01060315, "balance_loss_clip": 1.02896309, "balance_loss_mlp": 1.03833985, "epoch": 0.06349015481737562, "flos": 25153733733120.0, "grad_norm": 2.6139634882193867, "language_loss": 0.8117063, "learning_rate": 3.960485074507964e-06, "loss": 0.83386678, "num_input_tokens_seen": 22583035, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.171875, "step": 1056, "time_per_iteration": 2.4474260807037354 }, { "auxiliary_loss_clip": 0.01159162, "auxiliary_loss_mlp": 0.01056713, "balance_loss_clip": 1.02288127, "balance_loss_mlp": 1.03690875, "epoch": 0.06355027807004358, "flos": 26870517751680.0, "grad_norm": 2.441038065041055, "language_loss": 0.80776274, "learning_rate": 3.960410306529311e-06, "loss": 0.82992148, "num_input_tokens_seen": 22605055, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 1.21875, "step": 1057, "time_per_iteration": 2.497753858566284 }, { "auxiliary_loss_clip": 0.01145479, "auxiliary_loss_mlp": 0.01053419, "balance_loss_clip": 1.02721691, "balance_loss_mlp": 1.03655159, "epoch": 0.06361040132271156, "flos": 21395647808640.0, "grad_norm": 1.8377619202367705, "language_loss": 0.83484435, "learning_rate": 3.960335468588656e-06, "loss": 0.85683334, "num_input_tokens_seen": 22623760, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.09375, "step": 1058, "time_per_iteration": 2.4473915100097656 }, { "auxiliary_loss_clip": 0.01150486, "auxiliary_loss_mlp": 0.01053281, "balance_loss_clip": 1.02176166, "balance_loss_mlp": 1.03529727, "epoch": 0.06367052457537953, "flos": 25732733587200.0, "grad_norm": 2.1521473809757206, "language_loss": 0.87502033, "learning_rate": 3.960260560688672e-06, "loss": 0.89705795, "num_input_tokens_seen": 22643000, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.1484375, "step": 1059, "time_per_iteration": 2.487631320953369 }, { "auxiliary_loss_clip": 0.01157359, "auxiliary_loss_mlp": 0.0105957, "balance_loss_clip": 1.03045952, "balance_loss_mlp": 1.04114223, "epoch": 0.0637306478280475, "flos": 17630684346240.0, "grad_norm": 2.4567529624321938, "language_loss": 0.91952676, "learning_rate": 3.96018558283203e-06, "loss": 0.94169605, "num_input_tokens_seen": 22660460, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.15625, "step": 1060, "time_per_iteration": 2.4338083267211914 }, { "auxiliary_loss_clip": 0.01153993, "auxiliary_loss_mlp": 0.01054981, "balance_loss_clip": 1.02560759, "balance_loss_mlp": 1.03712416, "epoch": 0.06379077108071547, "flos": 13661757521280.0, "grad_norm": 2.079161993849353, "language_loss": 0.8758902, "learning_rate": 3.960110535021406e-06, "loss": 0.89797997, "num_input_tokens_seen": 22679270, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.171875, "step": 1061, "time_per_iteration": 2.447174549102783 }, { "auxiliary_loss_clip": 0.01159231, "auxiliary_loss_mlp": 0.01055013, "balance_loss_clip": 1.02487659, "balance_loss_mlp": 1.03590798, "epoch": 0.06385089433338344, "flos": 28477499944320.0, "grad_norm": 2.4121893917422734, "language_loss": 0.7742179, "learning_rate": 3.96003541725948e-06, "loss": 0.79636031, "num_input_tokens_seen": 22699330, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.234375, "step": 1062, "time_per_iteration": 2.530794620513916 }, { "auxiliary_loss_clip": 0.01152508, "auxiliary_loss_mlp": 0.01056336, "balance_loss_clip": 1.02829826, "balance_loss_mlp": 1.03505027, "epoch": 0.0639110175860514, "flos": 24310057795200.0, "grad_norm": 3.397358567992743, "language_loss": 0.8646583, "learning_rate": 3.959960229548932e-06, "loss": 0.88674676, "num_input_tokens_seen": 22717945, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.171875, "step": 1063, "time_per_iteration": 2.493764638900757 }, { "auxiliary_loss_clip": 0.01152675, "auxiliary_loss_mlp": 0.01061486, "balance_loss_clip": 1.03192258, "balance_loss_mlp": 1.03749716, "epoch": 0.06397114083871938, "flos": 22089686192640.0, "grad_norm": 1.8895788334406478, "language_loss": 0.79841852, "learning_rate": 3.9598849718924456e-06, "loss": 0.82056022, "num_input_tokens_seen": 22736790, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.1484375, "step": 1064, "time_per_iteration": 2.552570343017578 }, { "auxiliary_loss_clip": 0.01156097, "auxiliary_loss_mlp": 0.01062561, "balance_loss_clip": 1.03197181, "balance_loss_mlp": 1.03786206, "epoch": 0.06403126409138735, "flos": 19571819961600.0, "grad_norm": 2.915547150127337, "language_loss": 0.84240711, "learning_rate": 3.9598096442927045e-06, "loss": 0.86459368, "num_input_tokens_seen": 22754745, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.1796875, "step": 1065, "time_per_iteration": 2.483699321746826 }, { "auxiliary_loss_clip": 0.01153172, "auxiliary_loss_mlp": 0.01055382, "balance_loss_clip": 1.02670002, "balance_loss_mlp": 1.03946292, "epoch": 0.06409138734405531, "flos": 40805820708480.0, "grad_norm": 2.1064109549280228, "language_loss": 0.68253148, "learning_rate": 3.959734246752399e-06, "loss": 0.70461702, "num_input_tokens_seen": 22776780, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.140625, "step": 1066, "time_per_iteration": 2.613372564315796 }, { "auxiliary_loss_clip": 0.01153307, "auxiliary_loss_mlp": 0.01068058, "balance_loss_clip": 1.03811312, "balance_loss_mlp": 1.03984094, "epoch": 0.06415151059672328, "flos": 20440773590400.0, "grad_norm": 2.221808413280424, "language_loss": 0.9024362, "learning_rate": 3.959658779274219e-06, "loss": 0.92464983, "num_input_tokens_seen": 22793915, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.1328125, "step": 1067, "time_per_iteration": 2.4078049659729004 }, { "auxiliary_loss_clip": 0.01153334, "auxiliary_loss_mlp": 0.01056416, "balance_loss_clip": 1.02794874, "balance_loss_mlp": 1.03792787, "epoch": 0.06421163384939126, "flos": 18071218281600.0, "grad_norm": 2.0953299155703515, "language_loss": 0.83557618, "learning_rate": 3.959583241860859e-06, "loss": 0.85767376, "num_input_tokens_seen": 22812670, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.15625, "step": 1068, "time_per_iteration": 2.43798565864563 }, { "auxiliary_loss_clip": 0.01149899, "auxiliary_loss_mlp": 0.01055215, "balance_loss_clip": 1.02734399, "balance_loss_mlp": 1.03755999, "epoch": 0.06427175710205922, "flos": 25118261925120.0, "grad_norm": 2.7684988106959607, "language_loss": 0.89493138, "learning_rate": 3.959507634515013e-06, "loss": 0.91698253, "num_input_tokens_seen": 22832440, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.125, "step": 1069, "time_per_iteration": 2.4779961109161377 }, { "auxiliary_loss_clip": 0.01156154, "auxiliary_loss_mlp": 0.01066524, "balance_loss_clip": 1.03642344, "balance_loss_mlp": 1.03853703, "epoch": 0.06433188035472719, "flos": 17379693515520.0, "grad_norm": 2.6462331204119565, "language_loss": 0.95468295, "learning_rate": 3.95943195723938e-06, "loss": 0.97690964, "num_input_tokens_seen": 22845495, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.171875, "step": 1070, "time_per_iteration": 2.40043306350708 }, { "auxiliary_loss_clip": 0.01154162, "auxiliary_loss_mlp": 0.01050874, "balance_loss_clip": 1.02147698, "balance_loss_mlp": 1.03742683, "epoch": 0.06439200360739517, "flos": 23545250352000.0, "grad_norm": 1.9726107770921453, "language_loss": 0.88081366, "learning_rate": 3.959356210036661e-06, "loss": 0.90286404, "num_input_tokens_seen": 22865390, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.171875, "step": 1071, "time_per_iteration": 2.4496536254882812 }, { "auxiliary_loss_clip": 0.01149584, "auxiliary_loss_mlp": 0.01051725, "balance_loss_clip": 1.02471256, "balance_loss_mlp": 1.03599989, "epoch": 0.06445212686006313, "flos": 21978732291840.0, "grad_norm": 1.9189707447936222, "language_loss": 0.76146531, "learning_rate": 3.959280392909559e-06, "loss": 0.78347838, "num_input_tokens_seen": 22885495, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.140625, "step": 1072, "time_per_iteration": 2.4486169815063477 }, { "auxiliary_loss_clip": 0.01156692, "auxiliary_loss_mlp": 0.01058945, "balance_loss_clip": 1.02768803, "balance_loss_mlp": 1.03829575, "epoch": 0.0645122501127311, "flos": 25920112187520.0, "grad_norm": 2.0618225953997027, "language_loss": 0.80716658, "learning_rate": 3.9592045058607785e-06, "loss": 0.82932299, "num_input_tokens_seen": 22904845, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.1875, "step": 1073, "time_per_iteration": 2.4605281352996826 }, { "auxiliary_loss_clip": 0.01144171, "auxiliary_loss_mlp": 0.01053222, "balance_loss_clip": 1.02463543, "balance_loss_mlp": 1.03527343, "epoch": 0.06457237336539907, "flos": 25624956620160.0, "grad_norm": 1.6866784670733426, "language_loss": 0.80415916, "learning_rate": 3.95912854889303e-06, "loss": 0.82613313, "num_input_tokens_seen": 22925940, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.09375, "step": 1074, "time_per_iteration": 2.4511263370513916 }, { "auxiliary_loss_clip": 0.01153531, "auxiliary_loss_mlp": 0.01052702, "balance_loss_clip": 1.0233289, "balance_loss_mlp": 1.03641522, "epoch": 0.06463249661806704, "flos": 19462960742400.0, "grad_norm": 2.532306893728656, "language_loss": 0.78886366, "learning_rate": 3.959052522009023e-06, "loss": 0.81092602, "num_input_tokens_seen": 22944375, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.171875, "step": 1075, "time_per_iteration": 2.4653217792510986 }, { "auxiliary_loss_clip": 0.01157863, "auxiliary_loss_mlp": 0.01054332, "balance_loss_clip": 1.02711701, "balance_loss_mlp": 1.03994346, "epoch": 0.064692619870735, "flos": 24496912725120.0, "grad_norm": 5.248740749478744, "language_loss": 0.87301528, "learning_rate": 3.95897642521147e-06, "loss": 0.89513719, "num_input_tokens_seen": 22959145, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.1796875, "step": 1076, "time_per_iteration": 2.558877944946289 }, { "auxiliary_loss_clip": 0.01149619, "auxiliary_loss_mlp": 0.01046069, "balance_loss_clip": 1.01819813, "balance_loss_mlp": 1.03587496, "epoch": 0.06475274312340297, "flos": 17017748784000.0, "grad_norm": 2.1191892004808404, "language_loss": 0.80661476, "learning_rate": 3.958900258503089e-06, "loss": 0.82857162, "num_input_tokens_seen": 22978100, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.140625, "step": 1077, "time_per_iteration": 2.5130233764648438 }, { "auxiliary_loss_clip": 0.01156957, "auxiliary_loss_mlp": 0.01062066, "balance_loss_clip": 1.0318104, "balance_loss_mlp": 1.03646731, "epoch": 0.06481286637607095, "flos": 24571207831680.0, "grad_norm": 2.564596832680389, "language_loss": 0.91844654, "learning_rate": 3.958824021886595e-06, "loss": 0.94063681, "num_input_tokens_seen": 22997285, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.203125, "step": 1078, "time_per_iteration": 2.5395843982696533 }, { "auxiliary_loss_clip": 0.01160052, "auxiliary_loss_mlp": 0.01057451, "balance_loss_clip": 1.02744603, "balance_loss_mlp": 1.03943264, "epoch": 0.06487298962873891, "flos": 21104576870400.0, "grad_norm": 2.0316399257948365, "language_loss": 0.78641224, "learning_rate": 3.9587477153647115e-06, "loss": 0.80858719, "num_input_tokens_seen": 23016285, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.203125, "step": 1079, "time_per_iteration": 2.4766886234283447 }, { "auxiliary_loss_clip": 0.01151624, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.02979302, "balance_loss_mlp": 1.0372957, "epoch": 0.06493311288140688, "flos": 24607028753280.0, "grad_norm": 2.63909668044762, "language_loss": 0.68948388, "learning_rate": 3.95867133894016e-06, "loss": 0.71157992, "num_input_tokens_seen": 23036420, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.140625, "step": 1080, "time_per_iteration": 3.8689091205596924 }, { "auxiliary_loss_clip": 0.01151384, "auxiliary_loss_mlp": 0.01055946, "balance_loss_clip": 1.02595294, "balance_loss_mlp": 1.03560901, "epoch": 0.06499323613407486, "flos": 25336818236160.0, "grad_norm": 1.7999405252280114, "language_loss": 0.72002423, "learning_rate": 3.958594892615667e-06, "loss": 0.7420975, "num_input_tokens_seen": 23056945, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.15625, "step": 1081, "time_per_iteration": 2.4564967155456543 }, { "auxiliary_loss_clip": 0.0114835, "auxiliary_loss_mlp": 0.01051298, "balance_loss_clip": 1.02166224, "balance_loss_mlp": 1.03567648, "epoch": 0.06505335938674282, "flos": 20374682653440.0, "grad_norm": 2.8791195051438643, "language_loss": 0.84015405, "learning_rate": 3.95851837639396e-06, "loss": 0.86215043, "num_input_tokens_seen": 23074940, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.125, "step": 1082, "time_per_iteration": 2.4006152153015137 }, { "auxiliary_loss_clip": 0.01159429, "auxiliary_loss_mlp": 0.01064826, "balance_loss_clip": 1.03422475, "balance_loss_mlp": 1.03717995, "epoch": 0.06511348263941079, "flos": 25336748413440.0, "grad_norm": 5.4199843251700655, "language_loss": 0.82377207, "learning_rate": 3.9584417902777695e-06, "loss": 0.84601462, "num_input_tokens_seen": 23093420, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.21875, "step": 1083, "time_per_iteration": 3.9749627113342285 }, { "auxiliary_loss_clip": 0.01156056, "auxiliary_loss_mlp": 0.01057279, "balance_loss_clip": 1.02689254, "balance_loss_mlp": 1.0394218, "epoch": 0.06517360589207877, "flos": 20331949282560.0, "grad_norm": 2.566877750876929, "language_loss": 0.79550064, "learning_rate": 3.95836513426983e-06, "loss": 0.81763399, "num_input_tokens_seen": 23111550, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.1640625, "step": 1084, "time_per_iteration": 2.466878890991211 }, { "auxiliary_loss_clip": 0.01152333, "auxiliary_loss_mlp": 0.01053444, "balance_loss_clip": 1.02485764, "balance_loss_mlp": 1.03761244, "epoch": 0.06523372914474673, "flos": 31680432339840.0, "grad_norm": 5.835992372464286, "language_loss": 0.66288763, "learning_rate": 3.958288408372877e-06, "loss": 0.68494546, "num_input_tokens_seen": 23130335, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.1484375, "step": 1085, "time_per_iteration": 3.9148972034454346 }, { "auxiliary_loss_clip": 0.01147609, "auxiliary_loss_mlp": 0.0105211, "balance_loss_clip": 1.02476358, "balance_loss_mlp": 1.03516364, "epoch": 0.0652938523974147, "flos": 20777091517440.0, "grad_norm": 2.121651067054262, "language_loss": 0.76523113, "learning_rate": 3.9582116125896474e-06, "loss": 0.78722835, "num_input_tokens_seen": 23152380, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.125, "step": 1086, "time_per_iteration": 2.4858388900756836 }, { "auxiliary_loss_clip": 0.01148721, "auxiliary_loss_mlp": 0.01047731, "balance_loss_clip": 1.0208497, "balance_loss_mlp": 1.03547835, "epoch": 0.06535397565008266, "flos": 16690053962880.0, "grad_norm": 3.2443909718870723, "language_loss": 0.85044527, "learning_rate": 3.958134746922882e-06, "loss": 0.87240976, "num_input_tokens_seen": 23171630, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.1328125, "step": 1087, "time_per_iteration": 2.390460252761841 }, { "auxiliary_loss_clip": 0.01147463, "auxiliary_loss_mlp": 0.01053357, "balance_loss_clip": 1.02568889, "balance_loss_mlp": 1.0346911, "epoch": 0.06541409890275064, "flos": 26867061527040.0, "grad_norm": 2.6455725952910427, "language_loss": 0.77596116, "learning_rate": 3.958057811375325e-06, "loss": 0.7979694, "num_input_tokens_seen": 23192520, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.125, "step": 1088, "time_per_iteration": 2.4677419662475586 }, { "auxiliary_loss_clip": 0.01149935, "auxiliary_loss_mlp": 0.01057884, "balance_loss_clip": 1.02992916, "balance_loss_mlp": 1.03685796, "epoch": 0.06547422215541861, "flos": 20520584691840.0, "grad_norm": 1.7317476980719246, "language_loss": 0.71197081, "learning_rate": 3.957980805949722e-06, "loss": 0.73404896, "num_input_tokens_seen": 23210710, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.125, "step": 1089, "time_per_iteration": 2.386992931365967 }, { "auxiliary_loss_clip": 0.01147423, "auxiliary_loss_mlp": 0.01050398, "balance_loss_clip": 1.02330232, "balance_loss_mlp": 1.0368315, "epoch": 0.06553434540808657, "flos": 22015565642880.0, "grad_norm": 1.8712122613700142, "language_loss": 0.85494733, "learning_rate": 3.957903730648819e-06, "loss": 0.87692559, "num_input_tokens_seen": 23230305, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.109375, "step": 1090, "time_per_iteration": 2.4183759689331055 }, { "auxiliary_loss_clip": 0.01153315, "auxiliary_loss_mlp": 0.01055158, "balance_loss_clip": 1.02670288, "balance_loss_mlp": 1.03855705, "epoch": 0.06559446866075455, "flos": 24607482600960.0, "grad_norm": 2.0463246774747117, "language_loss": 0.71929127, "learning_rate": 3.957826585475369e-06, "loss": 0.74137598, "num_input_tokens_seen": 23249015, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.1484375, "step": 1091, "time_per_iteration": 2.446702241897583 }, { "auxiliary_loss_clip": 0.01148337, "auxiliary_loss_mlp": 0.01053979, "balance_loss_clip": 1.02652502, "balance_loss_mlp": 1.03667808, "epoch": 0.06565459191342252, "flos": 24273678291840.0, "grad_norm": 2.577704198220226, "language_loss": 0.82610309, "learning_rate": 3.957749370432124e-06, "loss": 0.84812617, "num_input_tokens_seen": 23265105, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.1171875, "step": 1092, "time_per_iteration": 2.3923897743225098 }, { "auxiliary_loss_clip": 0.01152296, "auxiliary_loss_mlp": 0.01055245, "balance_loss_clip": 1.02546632, "balance_loss_mlp": 1.03611541, "epoch": 0.06571471516609048, "flos": 24786063538560.0, "grad_norm": 1.8958847964951662, "language_loss": 0.7130363, "learning_rate": 3.957672085521841e-06, "loss": 0.73511177, "num_input_tokens_seen": 23283950, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.15625, "step": 1093, "time_per_iteration": 2.4138355255126953 }, { "auxiliary_loss_clip": 0.01149541, "auxiliary_loss_mlp": 0.01052042, "balance_loss_clip": 1.02276444, "balance_loss_mlp": 1.03724301, "epoch": 0.06577483841875846, "flos": 26212858871040.0, "grad_norm": 1.6711946765405614, "language_loss": 0.87978733, "learning_rate": 3.957594730747276e-06, "loss": 0.90180314, "num_input_tokens_seen": 23305005, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.125, "step": 1094, "time_per_iteration": 2.4436089992523193 }, { "auxiliary_loss_clip": 0.01151625, "auxiliary_loss_mlp": 0.0105448, "balance_loss_clip": 1.02482069, "balance_loss_mlp": 1.037292, "epoch": 0.06583496167142643, "flos": 25079683006080.0, "grad_norm": 2.2043409811576806, "language_loss": 0.81170315, "learning_rate": 3.957517306111191e-06, "loss": 0.8337642, "num_input_tokens_seen": 23323220, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.140625, "step": 1095, "time_per_iteration": 2.4198005199432373 }, { "auxiliary_loss_clip": 0.0114609, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.02168703, "balance_loss_mlp": 1.03538322, "epoch": 0.06589508492409439, "flos": 25628622312960.0, "grad_norm": 2.126074922761706, "language_loss": 0.6998198, "learning_rate": 3.957439811616349e-06, "loss": 0.72176647, "num_input_tokens_seen": 23342235, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.109375, "step": 1096, "time_per_iteration": 2.4462523460388184 }, { "auxiliary_loss_clip": 0.01152339, "auxiliary_loss_mlp": 0.01050908, "balance_loss_clip": 1.02412224, "balance_loss_mlp": 1.039469, "epoch": 0.06595520817676236, "flos": 23620173863040.0, "grad_norm": 1.8544065519083277, "language_loss": 0.77033997, "learning_rate": 3.957362247265515e-06, "loss": 0.79237241, "num_input_tokens_seen": 23363680, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.125, "step": 1097, "time_per_iteration": 2.40742564201355 }, { "auxiliary_loss_clip": 0.0115117, "auxiliary_loss_mlp": 0.0106126, "balance_loss_clip": 1.03356814, "balance_loss_mlp": 1.03739858, "epoch": 0.06601533142943034, "flos": 33800323449600.0, "grad_norm": 1.966380454277295, "language_loss": 0.78213745, "learning_rate": 3.957284613061456e-06, "loss": 0.80426174, "num_input_tokens_seen": 23385590, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.140625, "step": 1098, "time_per_iteration": 2.5256941318511963 }, { "auxiliary_loss_clip": 0.01150204, "auxiliary_loss_mlp": 0.01060173, "balance_loss_clip": 1.03039408, "balance_loss_mlp": 1.03755939, "epoch": 0.0660754546820983, "flos": 20258352403200.0, "grad_norm": 4.2821213113645795, "language_loss": 0.81474102, "learning_rate": 3.957206909006945e-06, "loss": 0.8368448, "num_input_tokens_seen": 23402945, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.125, "step": 1099, "time_per_iteration": 2.382452964782715 }, { "auxiliary_loss_clip": 0.01142698, "auxiliary_loss_mlp": 0.01050563, "balance_loss_clip": 1.02282298, "balance_loss_mlp": 1.03261256, "epoch": 0.06613557793476627, "flos": 19353158916480.0, "grad_norm": 3.173236474032098, "language_loss": 0.82873213, "learning_rate": 3.957129135104754e-06, "loss": 0.85066473, "num_input_tokens_seen": 23421410, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.1015625, "step": 1100, "time_per_iteration": 2.41461443901062 }, { "auxiliary_loss_clip": 0.01149334, "auxiliary_loss_mlp": 0.01056582, "balance_loss_clip": 1.02933121, "balance_loss_mlp": 1.03661847, "epoch": 0.06619570118743424, "flos": 13771698992640.0, "grad_norm": 2.412003980507769, "language_loss": 0.73175687, "learning_rate": 3.957051291357658e-06, "loss": 0.75381601, "num_input_tokens_seen": 23438870, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.125, "step": 1101, "time_per_iteration": 2.445695638656616 }, { "auxiliary_loss_clip": 0.01146847, "auxiliary_loss_mlp": 0.01058545, "balance_loss_clip": 1.02983987, "balance_loss_mlp": 1.0356338, "epoch": 0.06625582444010221, "flos": 17856921156480.0, "grad_norm": 2.432250688453231, "language_loss": 0.85938394, "learning_rate": 3.956973377768437e-06, "loss": 0.8814379, "num_input_tokens_seen": 23456975, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.109375, "step": 1102, "time_per_iteration": 2.3959429264068604 }, { "auxiliary_loss_clip": 0.01148477, "auxiliary_loss_mlp": 0.01051257, "balance_loss_clip": 1.02208626, "balance_loss_mlp": 1.03743041, "epoch": 0.06631594769277017, "flos": 11837894762880.0, "grad_norm": 4.580157305379214, "language_loss": 0.81804848, "learning_rate": 3.956895394339869e-06, "loss": 0.84004581, "num_input_tokens_seen": 23473440, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.109375, "step": 1103, "time_per_iteration": 2.3681488037109375 }, { "auxiliary_loss_clip": 0.01150511, "auxiliary_loss_mlp": 0.01060027, "balance_loss_clip": 1.03302574, "balance_loss_mlp": 1.03946304, "epoch": 0.06637607094543815, "flos": 19792296397440.0, "grad_norm": 1.8496289460402604, "language_loss": 0.81953788, "learning_rate": 3.956817341074738e-06, "loss": 0.84164321, "num_input_tokens_seen": 23493880, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.109375, "step": 1104, "time_per_iteration": 2.4122419357299805 }, { "auxiliary_loss_clip": 0.01143497, "auxiliary_loss_mlp": 0.01047648, "balance_loss_clip": 1.0187993, "balance_loss_mlp": 1.03398204, "epoch": 0.06643619419810612, "flos": 25484430931200.0, "grad_norm": 1.8390153581956532, "language_loss": 0.80658793, "learning_rate": 3.95673921797583e-06, "loss": 0.82849944, "num_input_tokens_seen": 23514920, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.09375, "step": 1105, "time_per_iteration": 2.446779727935791 }, { "auxiliary_loss_clip": 0.01144253, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.02591014, "balance_loss_mlp": 1.0359776, "epoch": 0.06649631745077408, "flos": 16945583270400.0, "grad_norm": 2.004951132118422, "language_loss": 0.96369636, "learning_rate": 3.956661025045933e-06, "loss": 0.98566276, "num_input_tokens_seen": 23531635, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0859375, "step": 1106, "time_per_iteration": 2.384737014770508 }, { "auxiliary_loss_clip": 0.01149602, "auxiliary_loss_mlp": 0.0104897, "balance_loss_clip": 1.02033615, "balance_loss_mlp": 1.03467488, "epoch": 0.06655644070344206, "flos": 17857619383680.0, "grad_norm": 3.070621473217749, "language_loss": 0.8192116, "learning_rate": 3.9565827622878365e-06, "loss": 0.84119731, "num_input_tokens_seen": 23551020, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.15625, "step": 1107, "time_per_iteration": 2.3848822116851807 }, { "auxiliary_loss_clip": 0.01043134, "auxiliary_loss_mlp": 0.01023949, "balance_loss_clip": 1.01925182, "balance_loss_mlp": 1.00925303, "epoch": 0.06661656395611003, "flos": 61416236062080.0, "grad_norm": 0.7916552539491276, "language_loss": 0.56714582, "learning_rate": 3.956504429704334e-06, "loss": 0.5878166, "num_input_tokens_seen": 23610675, "router_z_loss_clip": 0.046875, "router_z_loss_mlp": 0.33984375, "step": 1108, "time_per_iteration": 2.987449884414673 }, { "auxiliary_loss_clip": 0.01147292, "auxiliary_loss_mlp": 0.01054553, "balance_loss_clip": 1.0233202, "balance_loss_mlp": 1.03442478, "epoch": 0.066676687208778, "flos": 20661948253440.0, "grad_norm": 3.414117940188091, "language_loss": 0.72846961, "learning_rate": 3.956426027298221e-06, "loss": 0.7504881, "num_input_tokens_seen": 23628710, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.125, "step": 1109, "time_per_iteration": 2.4279117584228516 }, { "auxiliary_loss_clip": 0.01147421, "auxiliary_loss_mlp": 0.01054663, "balance_loss_clip": 1.02586257, "balance_loss_mlp": 1.03525794, "epoch": 0.06673681046144596, "flos": 20922225505920.0, "grad_norm": 2.0648281320744832, "language_loss": 0.7821449, "learning_rate": 3.956347555072296e-06, "loss": 0.80416572, "num_input_tokens_seen": 23649160, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.125, "step": 1110, "time_per_iteration": 2.421630382537842 }, { "auxiliary_loss_clip": 0.01148098, "auxiliary_loss_mlp": 0.01051408, "balance_loss_clip": 1.0239898, "balance_loss_mlp": 1.03642201, "epoch": 0.06679693371411394, "flos": 31064494400640.0, "grad_norm": 3.521541261242907, "language_loss": 0.71108806, "learning_rate": 3.95626901302936e-06, "loss": 0.73308313, "num_input_tokens_seen": 23671995, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.1171875, "step": 1111, "time_per_iteration": 2.4702768325805664 }, { "auxiliary_loss_clip": 0.01152294, "auxiliary_loss_mlp": 0.01052218, "balance_loss_clip": 1.02451348, "balance_loss_mlp": 1.03722906, "epoch": 0.0668570569667819, "flos": 21725053286400.0, "grad_norm": 2.0572766136120872, "language_loss": 0.78350592, "learning_rate": 3.956190401172214e-06, "loss": 0.80555105, "num_input_tokens_seen": 23690705, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.1484375, "step": 1112, "time_per_iteration": 2.4014744758605957 }, { "auxiliary_loss_clip": 0.01148717, "auxiliary_loss_mlp": 0.01057763, "balance_loss_clip": 1.0297966, "balance_loss_mlp": 1.03698647, "epoch": 0.06691718021944987, "flos": 22746158087040.0, "grad_norm": 2.2200154515730315, "language_loss": 0.79009718, "learning_rate": 3.956111719503664e-06, "loss": 0.81216192, "num_input_tokens_seen": 23709990, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.1171875, "step": 1113, "time_per_iteration": 2.4003682136535645 }, { "auxiliary_loss_clip": 0.01143582, "auxiliary_loss_mlp": 0.01045722, "balance_loss_clip": 1.01901865, "balance_loss_mlp": 1.03396714, "epoch": 0.06697730347211785, "flos": 16544675594880.0, "grad_norm": 1.8213310860122236, "language_loss": 0.82533109, "learning_rate": 3.956032968026519e-06, "loss": 0.84722418, "num_input_tokens_seen": 23728485, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.09375, "step": 1114, "time_per_iteration": 2.3902881145477295 }, { "auxiliary_loss_clip": 0.01045519, "auxiliary_loss_mlp": 0.01003938, "balance_loss_clip": 0.99950367, "balance_loss_mlp": 1.0111022, "epoch": 0.06703742672478581, "flos": 59779123499520.0, "grad_norm": 0.8243039787841735, "language_loss": 0.58152986, "learning_rate": 3.955954146743589e-06, "loss": 0.60202444, "num_input_tokens_seen": 23786650, "router_z_loss_clip": 0.04443359, "router_z_loss_mlp": 0.34375, "step": 1115, "time_per_iteration": 2.975740432739258 }, { "auxiliary_loss_clip": 0.01148229, "auxiliary_loss_mlp": 0.01056833, "balance_loss_clip": 1.02709055, "balance_loss_mlp": 1.03565681, "epoch": 0.06709754997745378, "flos": 16799262295680.0, "grad_norm": 3.130615164771175, "language_loss": 0.9187237, "learning_rate": 3.9558752556576874e-06, "loss": 0.94077432, "num_input_tokens_seen": 23802555, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.125, "step": 1116, "time_per_iteration": 2.3603169918060303 }, { "auxiliary_loss_clip": 0.01154845, "auxiliary_loss_mlp": 0.01060873, "balance_loss_clip": 1.03114212, "balance_loss_mlp": 1.03859985, "epoch": 0.06715767323012176, "flos": 22122923673600.0, "grad_norm": 2.095241715439275, "language_loss": 0.87228984, "learning_rate": 3.955796294771628e-06, "loss": 0.89444697, "num_input_tokens_seen": 23822945, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.1640625, "step": 1117, "time_per_iteration": 2.437349319458008 }, { "auxiliary_loss_clip": 0.01041756, "auxiliary_loss_mlp": 0.01006534, "balance_loss_clip": 1.00224292, "balance_loss_mlp": 1.00733614, "epoch": 0.06721779648278972, "flos": 66615363020160.0, "grad_norm": 0.8524745008189767, "language_loss": 0.59762853, "learning_rate": 3.95571726408823e-06, "loss": 0.61811143, "num_input_tokens_seen": 23874075, "router_z_loss_clip": 0.04296875, "router_z_loss_mlp": 0.34375, "step": 1118, "time_per_iteration": 3.0528414249420166 }, { "auxiliary_loss_clip": 0.01146149, "auxiliary_loss_mlp": 0.01046751, "balance_loss_clip": 1.01955914, "balance_loss_mlp": 1.03465438, "epoch": 0.06727791973545769, "flos": 22381385535360.0, "grad_norm": 6.487344051059983, "language_loss": 0.82986391, "learning_rate": 3.955638163610314e-06, "loss": 0.85179293, "num_input_tokens_seen": 23889720, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.1171875, "step": 1119, "time_per_iteration": 2.419816732406616 }, { "auxiliary_loss_clip": 0.01144101, "auxiliary_loss_mlp": 0.01050539, "balance_loss_clip": 1.02450359, "balance_loss_mlp": 1.03527403, "epoch": 0.06733804298812565, "flos": 24279054641280.0, "grad_norm": 1.8616435906553814, "language_loss": 0.8482362, "learning_rate": 3.955558993340703e-06, "loss": 0.87018257, "num_input_tokens_seen": 23909385, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.09375, "step": 1120, "time_per_iteration": 3.8828959465026855 }, { "auxiliary_loss_clip": 0.01156273, "auxiliary_loss_mlp": 0.01058698, "balance_loss_clip": 1.03104162, "balance_loss_mlp": 1.0411582, "epoch": 0.06739816624079363, "flos": 15917496197760.0, "grad_norm": 2.2660236821839623, "language_loss": 0.78819853, "learning_rate": 3.955479753282221e-06, "loss": 0.81034827, "num_input_tokens_seen": 23926830, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.1484375, "step": 1121, "time_per_iteration": 2.4081673622131348 }, { "auxiliary_loss_clip": 0.01039947, "auxiliary_loss_mlp": 0.01005861, "balance_loss_clip": 1.00168824, "balance_loss_mlp": 1.00650489, "epoch": 0.0674582894934616, "flos": 71394656613120.0, "grad_norm": 0.7533590153096971, "language_loss": 0.58349454, "learning_rate": 3.955400443437696e-06, "loss": 0.60395265, "num_input_tokens_seen": 23992640, "router_z_loss_clip": 0.04174805, "router_z_loss_mlp": 0.3359375, "step": 1122, "time_per_iteration": 4.462320566177368 }, { "auxiliary_loss_clip": 0.0115262, "auxiliary_loss_mlp": 0.01052663, "balance_loss_clip": 1.02430344, "balance_loss_mlp": 1.03925991, "epoch": 0.06751841274612956, "flos": 25263779938560.0, "grad_norm": 2.039678532660783, "language_loss": 0.71565163, "learning_rate": 3.95532106380996e-06, "loss": 0.7377044, "num_input_tokens_seen": 24011135, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.1328125, "step": 1123, "time_per_iteration": 3.810786724090576 }, { "auxiliary_loss_clip": 0.01150952, "auxiliary_loss_mlp": 0.0105428, "balance_loss_clip": 1.02476323, "balance_loss_mlp": 1.03756046, "epoch": 0.06757853599879754, "flos": 23801687354880.0, "grad_norm": 1.86152894512166, "language_loss": 0.79015303, "learning_rate": 3.9552416144018445e-06, "loss": 0.81220531, "num_input_tokens_seen": 24030695, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.1328125, "step": 1124, "time_per_iteration": 2.4354350566864014 }, { "auxiliary_loss_clip": 0.01145399, "auxiliary_loss_mlp": 0.01044236, "balance_loss_clip": 1.01797402, "balance_loss_mlp": 1.0362289, "epoch": 0.0676386592514655, "flos": 21032655736320.0, "grad_norm": 2.7282165640532234, "language_loss": 0.71316373, "learning_rate": 3.955162095216186e-06, "loss": 0.7350601, "num_input_tokens_seen": 24050680, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.09375, "step": 1125, "time_per_iteration": 3.7452244758605957 }, { "auxiliary_loss_clip": 0.01145681, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.02295339, "balance_loss_mlp": 1.03685808, "epoch": 0.06769878250413347, "flos": 25555165079040.0, "grad_norm": 2.6751849311736544, "language_loss": 0.81167436, "learning_rate": 3.95508250625582e-06, "loss": 0.83365488, "num_input_tokens_seen": 24067205, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.0859375, "step": 1126, "time_per_iteration": 2.447014331817627 }, { "auxiliary_loss_clip": 0.01040105, "auxiliary_loss_mlp": 0.01005089, "balance_loss_clip": 1.00074983, "balance_loss_mlp": 1.00653863, "epoch": 0.06775890575680145, "flos": 70651740458880.0, "grad_norm": 0.7816526170598117, "language_loss": 0.59801042, "learning_rate": 3.95500284752359e-06, "loss": 0.61846232, "num_input_tokens_seen": 24131320, "router_z_loss_clip": 0.04345703, "router_z_loss_mlp": 0.3359375, "step": 1127, "time_per_iteration": 3.066779851913452 }, { "auxiliary_loss_clip": 0.01147014, "auxiliary_loss_mlp": 0.01048138, "balance_loss_clip": 1.02029109, "balance_loss_mlp": 1.03688431, "epoch": 0.06781902900946941, "flos": 24234575702400.0, "grad_norm": 2.224979259672447, "language_loss": 0.81246132, "learning_rate": 3.954923119022337e-06, "loss": 0.83441287, "num_input_tokens_seen": 24149930, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.1015625, "step": 1128, "time_per_iteration": 2.4281606674194336 }, { "auxiliary_loss_clip": 0.01154245, "auxiliary_loss_mlp": 0.01046281, "balance_loss_clip": 1.01767111, "balance_loss_mlp": 1.03841734, "epoch": 0.06787915226213738, "flos": 22416473318400.0, "grad_norm": 2.7069058363169156, "language_loss": 0.75399923, "learning_rate": 3.9548433207549065e-06, "loss": 0.77600449, "num_input_tokens_seen": 24169590, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.15625, "step": 1129, "time_per_iteration": 2.4316508769989014 }, { "auxiliary_loss_clip": 0.01145738, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.01823974, "balance_loss_mlp": 1.03559637, "epoch": 0.06793927551480534, "flos": 37705393664640.0, "grad_norm": 1.7740383949338567, "language_loss": 0.71722078, "learning_rate": 3.954763452724146e-06, "loss": 0.73914433, "num_input_tokens_seen": 24189965, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.09375, "step": 1130, "time_per_iteration": 2.5574941635131836 }, { "auxiliary_loss_clip": 0.01145337, "auxiliary_loss_mlp": 0.01049854, "balance_loss_clip": 1.02341366, "balance_loss_mlp": 1.03672767, "epoch": 0.06799939876747332, "flos": 20630351606400.0, "grad_norm": 2.5622001697638903, "language_loss": 0.80953151, "learning_rate": 3.954683514932906e-06, "loss": 0.83148336, "num_input_tokens_seen": 24208045, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0859375, "step": 1131, "time_per_iteration": 2.46608567237854 }, { "auxiliary_loss_clip": 0.0114364, "auxiliary_loss_mlp": 0.0105958, "balance_loss_clip": 1.0310415, "balance_loss_mlp": 1.0365063, "epoch": 0.06805952202014129, "flos": 14863921966080.0, "grad_norm": 10.230628435090006, "language_loss": 0.80578613, "learning_rate": 3.95460350738404e-06, "loss": 0.82781839, "num_input_tokens_seen": 24223805, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.0703125, "step": 1132, "time_per_iteration": 2.406174421310425 }, { "auxiliary_loss_clip": 0.01144118, "auxiliary_loss_mlp": 0.01054145, "balance_loss_clip": 1.02660751, "balance_loss_mlp": 1.03547812, "epoch": 0.06811964527280925, "flos": 48907555747200.0, "grad_norm": 1.5189189589491072, "language_loss": 0.63690358, "learning_rate": 3.954523430080402e-06, "loss": 0.65888619, "num_input_tokens_seen": 24249475, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.0859375, "step": 1133, "time_per_iteration": 2.6687326431274414 }, { "auxiliary_loss_clip": 0.01150547, "auxiliary_loss_mlp": 0.01053353, "balance_loss_clip": 1.02378917, "balance_loss_mlp": 1.03586221, "epoch": 0.06817976852547723, "flos": 15376377035520.0, "grad_norm": 2.2196303995449114, "language_loss": 0.74988973, "learning_rate": 3.9544432830248504e-06, "loss": 0.77192879, "num_input_tokens_seen": 24267980, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.1484375, "step": 1134, "time_per_iteration": 2.389263391494751 }, { "auxiliary_loss_clip": 0.01144706, "auxiliary_loss_mlp": 0.01052305, "balance_loss_clip": 1.02647233, "balance_loss_mlp": 1.03708625, "epoch": 0.0682398917781452, "flos": 20154694976640.0, "grad_norm": 3.295505603148525, "language_loss": 0.8708508, "learning_rate": 3.954363066220246e-06, "loss": 0.89282089, "num_input_tokens_seen": 24286805, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.078125, "step": 1135, "time_per_iteration": 2.395012378692627 }, { "auxiliary_loss_clip": 0.01146483, "auxiliary_loss_mlp": 0.01045769, "balance_loss_clip": 1.01824403, "balance_loss_mlp": 1.03473854, "epoch": 0.06830001503081316, "flos": 23439498243840.0, "grad_norm": 2.9357431467981527, "language_loss": 0.77959895, "learning_rate": 3.954282779669451e-06, "loss": 0.80152142, "num_input_tokens_seen": 24305855, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.1171875, "step": 1136, "time_per_iteration": 2.437164783477783 }, { "auxiliary_loss_clip": 0.01149479, "auxiliary_loss_mlp": 0.01056481, "balance_loss_clip": 1.02828848, "balance_loss_mlp": 1.0376966, "epoch": 0.06836013828348114, "flos": 34348389972480.0, "grad_norm": 10.53788335757046, "language_loss": 0.83737171, "learning_rate": 3.95420242337533e-06, "loss": 0.85943127, "num_input_tokens_seen": 24326535, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.1171875, "step": 1137, "time_per_iteration": 2.509925127029419 }, { "auxiliary_loss_clip": 0.01143737, "auxiliary_loss_mlp": 0.01049197, "balance_loss_clip": 1.02167106, "balance_loss_mlp": 1.0355711, "epoch": 0.06842026153614911, "flos": 23147729078400.0, "grad_norm": 2.548998429243754, "language_loss": 0.78280199, "learning_rate": 3.954121997340752e-06, "loss": 0.80473137, "num_input_tokens_seen": 24345810, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.078125, "step": 1138, "time_per_iteration": 2.451308012008667 }, { "auxiliary_loss_clip": 0.01147673, "auxiliary_loss_mlp": 0.01060586, "balance_loss_clip": 1.03071165, "balance_loss_mlp": 1.03628469, "epoch": 0.06848038478881707, "flos": 24607796803200.0, "grad_norm": 2.4230119432478325, "language_loss": 0.85318613, "learning_rate": 3.9540415015685855e-06, "loss": 0.8752687, "num_input_tokens_seen": 24366095, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.109375, "step": 1139, "time_per_iteration": 2.418158769607544 }, { "auxiliary_loss_clip": 0.01145708, "auxiliary_loss_mlp": 0.01045786, "balance_loss_clip": 1.01917887, "balance_loss_mlp": 1.0365721, "epoch": 0.06854050804148504, "flos": 40879382676480.0, "grad_norm": 1.802646008694186, "language_loss": 0.74583817, "learning_rate": 3.953960936061706e-06, "loss": 0.76775312, "num_input_tokens_seen": 24388665, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.09375, "step": 1140, "time_per_iteration": 2.5871405601501465 }, { "auxiliary_loss_clip": 0.01144081, "auxiliary_loss_mlp": 0.01060288, "balance_loss_clip": 1.02993786, "balance_loss_mlp": 1.03546023, "epoch": 0.06860063129415302, "flos": 31685005728000.0, "grad_norm": 2.353283521657733, "language_loss": 0.6831162, "learning_rate": 3.9538803008229845e-06, "loss": 0.70515984, "num_input_tokens_seen": 24407705, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.0859375, "step": 1141, "time_per_iteration": 2.4802167415618896 }, { "auxiliary_loss_clip": 0.01147212, "auxiliary_loss_mlp": 0.01055887, "balance_loss_clip": 1.02670479, "balance_loss_mlp": 1.03654337, "epoch": 0.06866075454682098, "flos": 26540798071680.0, "grad_norm": 2.4042942679845396, "language_loss": 0.78867722, "learning_rate": 3.953799595855303e-06, "loss": 0.81070817, "num_input_tokens_seen": 24428390, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.109375, "step": 1142, "time_per_iteration": 2.4773917198181152 }, { "auxiliary_loss_clip": 0.01144319, "auxiliary_loss_mlp": 0.01050934, "balance_loss_clip": 1.02454114, "balance_loss_mlp": 1.03604782, "epoch": 0.06872087779948895, "flos": 29788453785600.0, "grad_norm": 1.8654347049738194, "language_loss": 0.6836428, "learning_rate": 3.953718821161539e-06, "loss": 0.70559537, "num_input_tokens_seen": 24450810, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0859375, "step": 1143, "time_per_iteration": 2.5048325061798096 }, { "auxiliary_loss_clip": 0.01138427, "auxiliary_loss_mlp": 0.01050695, "balance_loss_clip": 1.02438569, "balance_loss_mlp": 1.03550994, "epoch": 0.06878100105215693, "flos": 26939960179200.0, "grad_norm": 1.722226436663597, "language_loss": 0.74243826, "learning_rate": 3.953637976744576e-06, "loss": 0.76432949, "num_input_tokens_seen": 24469965, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.03125, "step": 1144, "time_per_iteration": 2.498687267303467 }, { "auxiliary_loss_clip": 0.01146537, "auxiliary_loss_mlp": 0.01055603, "balance_loss_clip": 1.02723074, "balance_loss_mlp": 1.03431416, "epoch": 0.06884112430482489, "flos": 10669980228480.0, "grad_norm": 4.7915259833189205, "language_loss": 0.9168638, "learning_rate": 3.953557062607299e-06, "loss": 0.93888521, "num_input_tokens_seen": 24486370, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.125, "step": 1145, "time_per_iteration": 2.358703136444092 }, { "auxiliary_loss_clip": 0.01150496, "auxiliary_loss_mlp": 0.01056676, "balance_loss_clip": 1.02692151, "balance_loss_mlp": 1.03589928, "epoch": 0.06890124755749286, "flos": 20192610579840.0, "grad_norm": 2.3528228070886286, "language_loss": 0.81935954, "learning_rate": 3.953476078752595e-06, "loss": 0.84143126, "num_input_tokens_seen": 24503780, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.140625, "step": 1146, "time_per_iteration": 2.4289767742156982 }, { "auxiliary_loss_clip": 0.01142062, "auxiliary_loss_mlp": 0.01057507, "balance_loss_clip": 1.0310905, "balance_loss_mlp": 1.03638935, "epoch": 0.06896137081016084, "flos": 20448174798720.0, "grad_norm": 2.3180790737616364, "language_loss": 0.84927756, "learning_rate": 3.953395025183355e-06, "loss": 0.87127328, "num_input_tokens_seen": 24522320, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.0625, "step": 1147, "time_per_iteration": 2.3913071155548096 }, { "auxiliary_loss_clip": 0.01145703, "auxiliary_loss_mlp": 0.01050701, "balance_loss_clip": 1.02339029, "balance_loss_mlp": 1.03590751, "epoch": 0.0690214940628288, "flos": 18367735392000.0, "grad_norm": 1.9433216530482342, "language_loss": 0.85627848, "learning_rate": 3.9533139019024715e-06, "loss": 0.87824255, "num_input_tokens_seen": 24540445, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.09375, "step": 1148, "time_per_iteration": 2.3985259532928467 }, { "auxiliary_loss_clip": 0.01142808, "auxiliary_loss_mlp": 0.01051462, "balance_loss_clip": 1.02472377, "balance_loss_mlp": 1.0347476, "epoch": 0.06908161731549677, "flos": 20556999106560.0, "grad_norm": 2.5802358550337074, "language_loss": 0.69454765, "learning_rate": 3.953232708912839e-06, "loss": 0.71649039, "num_input_tokens_seen": 24557105, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.078125, "step": 1149, "time_per_iteration": 2.38120436668396 }, { "auxiliary_loss_clip": 0.01148519, "auxiliary_loss_mlp": 0.01048599, "balance_loss_clip": 1.02033401, "balance_loss_mlp": 1.03614509, "epoch": 0.06914174056816474, "flos": 27562426542720.0, "grad_norm": 2.0275636731663966, "language_loss": 0.83030009, "learning_rate": 3.953151446217356e-06, "loss": 0.85227126, "num_input_tokens_seen": 24578240, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.125, "step": 1150, "time_per_iteration": 2.4575576782226562 }, { "auxiliary_loss_clip": 0.01150229, "auxiliary_loss_mlp": 0.01055713, "balance_loss_clip": 1.02748466, "balance_loss_mlp": 1.03936958, "epoch": 0.06920186382083271, "flos": 15303129269760.0, "grad_norm": 3.4158487911616255, "language_loss": 0.8146646, "learning_rate": 3.953070113818921e-06, "loss": 0.83672404, "num_input_tokens_seen": 24593585, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.109375, "step": 1151, "time_per_iteration": 2.356996774673462 }, { "auxiliary_loss_clip": 0.01142556, "auxiliary_loss_mlp": 0.01047837, "balance_loss_clip": 1.0213964, "balance_loss_mlp": 1.03563929, "epoch": 0.06926198707350067, "flos": 25190078325120.0, "grad_norm": 2.1900678702636296, "language_loss": 0.85472023, "learning_rate": 3.952988711720439e-06, "loss": 0.87662417, "num_input_tokens_seen": 24613110, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0625, "step": 1152, "time_per_iteration": 2.472827196121216 }, { "auxiliary_loss_clip": 0.01142016, "auxiliary_loss_mlp": 0.01049989, "balance_loss_clip": 1.02365518, "balance_loss_mlp": 1.03515291, "epoch": 0.06932211032616864, "flos": 13255438584960.0, "grad_norm": 1.9839593451496706, "language_loss": 0.90736151, "learning_rate": 3.952907239924813e-06, "loss": 0.92928159, "num_input_tokens_seen": 24628795, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.0703125, "step": 1153, "time_per_iteration": 2.3746461868286133 }, { "auxiliary_loss_clip": 0.01144548, "auxiliary_loss_mlp": 0.01048424, "balance_loss_clip": 1.01949143, "balance_loss_mlp": 1.035882, "epoch": 0.06938223357883662, "flos": 24826213468800.0, "grad_norm": 2.2986110293559463, "language_loss": 0.81671846, "learning_rate": 3.95282569843495e-06, "loss": 0.83864814, "num_input_tokens_seen": 24645480, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.09375, "step": 1154, "time_per_iteration": 2.451007843017578 }, { "auxiliary_loss_clip": 0.01141782, "auxiliary_loss_mlp": 0.01054789, "balance_loss_clip": 1.02863431, "balance_loss_mlp": 1.03702188, "epoch": 0.06944235683150458, "flos": 27266852039040.0, "grad_norm": 1.8435908644227317, "language_loss": 0.75050694, "learning_rate": 3.952744087253762e-06, "loss": 0.77247268, "num_input_tokens_seen": 24664630, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.046875, "step": 1155, "time_per_iteration": 2.449211597442627 }, { "auxiliary_loss_clip": 0.01141457, "auxiliary_loss_mlp": 0.01045484, "balance_loss_clip": 1.01813757, "balance_loss_mlp": 1.0339638, "epoch": 0.06950248008417255, "flos": 25806993782400.0, "grad_norm": 1.809875608216508, "language_loss": 0.70478129, "learning_rate": 3.952662406384161e-06, "loss": 0.72665071, "num_input_tokens_seen": 24684210, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.078125, "step": 1156, "time_per_iteration": 2.47178316116333 }, { "auxiliary_loss_clip": 0.01144698, "auxiliary_loss_mlp": 0.01056941, "balance_loss_clip": 1.02679288, "balance_loss_mlp": 1.03540492, "epoch": 0.06956260333684053, "flos": 22270501457280.0, "grad_norm": 2.042998869215872, "language_loss": 0.75011253, "learning_rate": 3.952580655829061e-06, "loss": 0.77212894, "num_input_tokens_seen": 24702490, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.09375, "step": 1157, "time_per_iteration": 2.4180266857147217 }, { "auxiliary_loss_clip": 0.01144855, "auxiliary_loss_mlp": 0.01055636, "balance_loss_clip": 1.02833712, "balance_loss_mlp": 1.03508615, "epoch": 0.0696227265895085, "flos": 29680048414080.0, "grad_norm": 1.9123471701501298, "language_loss": 0.71525955, "learning_rate": 3.952498835591381e-06, "loss": 0.73726451, "num_input_tokens_seen": 24724340, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.09375, "step": 1158, "time_per_iteration": 2.50199031829834 }, { "auxiliary_loss_clip": 0.01144933, "auxiliary_loss_mlp": 0.0105089, "balance_loss_clip": 1.0223515, "balance_loss_mlp": 1.03502059, "epoch": 0.06968284984217646, "flos": 25522276711680.0, "grad_norm": 1.8495643318440533, "language_loss": 0.79798836, "learning_rate": 3.952416945674039e-06, "loss": 0.81994659, "num_input_tokens_seen": 24745550, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.09375, "step": 1159, "time_per_iteration": 3.928196668624878 }, { "auxiliary_loss_clip": 0.01149889, "auxiliary_loss_mlp": 0.01054699, "balance_loss_clip": 1.02370465, "balance_loss_mlp": 1.0370692, "epoch": 0.06974297309484444, "flos": 20697315327360.0, "grad_norm": 2.9024267789836666, "language_loss": 0.80438364, "learning_rate": 3.952334986079957e-06, "loss": 0.82642949, "num_input_tokens_seen": 24762575, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.125, "step": 1160, "time_per_iteration": 2.3827168941497803 }, { "auxiliary_loss_clip": 0.01143649, "auxiliary_loss_mlp": 0.01054615, "balance_loss_clip": 1.02514696, "balance_loss_mlp": 1.03302932, "epoch": 0.0698030963475124, "flos": 26503999632000.0, "grad_norm": 1.6930892433628664, "language_loss": 0.756661, "learning_rate": 3.9522529568120635e-06, "loss": 0.77864367, "num_input_tokens_seen": 24782605, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.109375, "step": 1161, "time_per_iteration": 2.4427478313446045 }, { "auxiliary_loss_clip": 0.01140831, "auxiliary_loss_mlp": 0.01054918, "balance_loss_clip": 1.02531838, "balance_loss_mlp": 1.03243947, "epoch": 0.06986321960018037, "flos": 23039288795520.0, "grad_norm": 1.789627523228887, "language_loss": 0.82873094, "learning_rate": 3.952170857873283e-06, "loss": 0.85068834, "num_input_tokens_seen": 24802910, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.0859375, "step": 1162, "time_per_iteration": 5.221744537353516 }, { "auxiliary_loss_clip": 0.01137981, "auxiliary_loss_mlp": 0.01047741, "balance_loss_clip": 1.01998901, "balance_loss_mlp": 1.03151393, "epoch": 0.06992334285284833, "flos": 28583566254720.0, "grad_norm": 2.1630598390116518, "language_loss": 0.78933895, "learning_rate": 3.952088689266547e-06, "loss": 0.81119615, "num_input_tokens_seen": 24823305, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0625, "step": 1163, "time_per_iteration": 2.46140193939209 }, { "auxiliary_loss_clip": 0.01143545, "auxiliary_loss_mlp": 0.01054695, "balance_loss_clip": 1.02511919, "balance_loss_mlp": 1.03399253, "epoch": 0.06998346610551631, "flos": 20594286305280.0, "grad_norm": 2.0283146772607057, "language_loss": 0.79181325, "learning_rate": 3.952006450994786e-06, "loss": 0.81379569, "num_input_tokens_seen": 24842155, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.09375, "step": 1164, "time_per_iteration": 3.7580206394195557 }, { "auxiliary_loss_clip": 0.01143306, "auxiliary_loss_mlp": 0.01055278, "balance_loss_clip": 1.02756214, "balance_loss_mlp": 1.0347116, "epoch": 0.07004358935818428, "flos": 22527706510080.0, "grad_norm": 1.5771235008123332, "language_loss": 0.72730517, "learning_rate": 3.951924143060937e-06, "loss": 0.74929094, "num_input_tokens_seen": 24862080, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0859375, "step": 1165, "time_per_iteration": 2.418278694152832 }, { "auxiliary_loss_clip": 0.01142289, "auxiliary_loss_mlp": 0.01051947, "balance_loss_clip": 1.02405202, "balance_loss_mlp": 1.03415227, "epoch": 0.07010371261085224, "flos": 28948722831360.0, "grad_norm": 2.5634817613944993, "language_loss": 0.80783445, "learning_rate": 3.951841765467935e-06, "loss": 0.82977676, "num_input_tokens_seen": 24886165, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.078125, "step": 1166, "time_per_iteration": 2.481705904006958 }, { "auxiliary_loss_clip": 0.0113909, "auxiliary_loss_mlp": 0.01044312, "balance_loss_clip": 1.01615429, "balance_loss_mlp": 1.03347373, "epoch": 0.07016383586352022, "flos": 23658054554880.0, "grad_norm": 2.060449597299873, "language_loss": 0.84201783, "learning_rate": 3.951759318218722e-06, "loss": 0.86385179, "num_input_tokens_seen": 24905775, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0546875, "step": 1167, "time_per_iteration": 2.4210996627807617 }, { "auxiliary_loss_clip": 0.01144835, "auxiliary_loss_mlp": 0.01052096, "balance_loss_clip": 1.02310443, "balance_loss_mlp": 1.03402662, "epoch": 0.07022395911618819, "flos": 19791109411200.0, "grad_norm": 2.2660493238016324, "language_loss": 0.89404839, "learning_rate": 3.951676801316239e-06, "loss": 0.91601771, "num_input_tokens_seen": 24924295, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.109375, "step": 1168, "time_per_iteration": 2.4255263805389404 }, { "auxiliary_loss_clip": 0.01145153, "auxiliary_loss_mlp": 0.01062315, "balance_loss_clip": 1.03074884, "balance_loss_mlp": 1.03228617, "epoch": 0.07028408236885615, "flos": 21688080289920.0, "grad_norm": 6.326228126873958, "language_loss": 0.88479823, "learning_rate": 3.951594214763431e-06, "loss": 0.90687293, "num_input_tokens_seen": 24943210, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.125, "step": 1169, "time_per_iteration": 2.4024498462677 }, { "auxiliary_loss_clip": 0.01145541, "auxiliary_loss_mlp": 0.01061234, "balance_loss_clip": 1.03035831, "balance_loss_mlp": 1.03636777, "epoch": 0.07034420562152413, "flos": 25629076160640.0, "grad_norm": 3.762141607568952, "language_loss": 0.83485639, "learning_rate": 3.951511558563246e-06, "loss": 0.85692418, "num_input_tokens_seen": 24960360, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.09375, "step": 1170, "time_per_iteration": 2.454987049102783 }, { "auxiliary_loss_clip": 0.01142319, "auxiliary_loss_mlp": 0.01058716, "balance_loss_clip": 1.02900887, "balance_loss_mlp": 1.03398812, "epoch": 0.0704043288741921, "flos": 20809491125760.0, "grad_norm": 2.0649427383890804, "language_loss": 0.75835848, "learning_rate": 3.951428832718633e-06, "loss": 0.7803688, "num_input_tokens_seen": 24978290, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.078125, "step": 1171, "time_per_iteration": 2.4012880325317383 }, { "auxiliary_loss_clip": 0.01142165, "auxiliary_loss_mlp": 0.0104635, "balance_loss_clip": 1.01909852, "balance_loss_mlp": 1.03408408, "epoch": 0.07046445212686006, "flos": 25591998430080.0, "grad_norm": 1.8672122014430101, "language_loss": 0.88891184, "learning_rate": 3.951346037232546e-06, "loss": 0.91079688, "num_input_tokens_seen": 24997055, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.078125, "step": 1172, "time_per_iteration": 2.453601121902466 }, { "auxiliary_loss_clip": 0.01143036, "auxiliary_loss_mlp": 0.01048481, "balance_loss_clip": 1.01798701, "balance_loss_mlp": 1.03233933, "epoch": 0.07052457537952803, "flos": 25555793483520.0, "grad_norm": 2.2028699019088385, "language_loss": 0.82122999, "learning_rate": 3.951263172107937e-06, "loss": 0.84314519, "num_input_tokens_seen": 25017490, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.109375, "step": 1173, "time_per_iteration": 2.4472427368164062 }, { "auxiliary_loss_clip": 0.0114247, "auxiliary_loss_mlp": 0.01051073, "balance_loss_clip": 1.02131832, "balance_loss_mlp": 1.03417015, "epoch": 0.070584698632196, "flos": 17967525943680.0, "grad_norm": 56.39445814219136, "language_loss": 0.8231191, "learning_rate": 3.951180237347765e-06, "loss": 0.84505451, "num_input_tokens_seen": 25035660, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.078125, "step": 1174, "time_per_iteration": 2.4031693935394287 }, { "auxiliary_loss_clip": 0.01142534, "auxiliary_loss_mlp": 0.01051656, "balance_loss_clip": 1.02391613, "balance_loss_mlp": 1.03413618, "epoch": 0.07064482188486397, "flos": 25369811337600.0, "grad_norm": 2.066140012195251, "language_loss": 0.85233241, "learning_rate": 3.951097232954989e-06, "loss": 0.87427437, "num_input_tokens_seen": 25054785, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.078125, "step": 1175, "time_per_iteration": 2.438138484954834 }, { "auxiliary_loss_clip": 0.01145033, "auxiliary_loss_mlp": 0.01059015, "balance_loss_clip": 1.02916503, "balance_loss_mlp": 1.03509188, "epoch": 0.07070494513753194, "flos": 24898693184640.0, "grad_norm": 1.9238158226412332, "language_loss": 0.83100969, "learning_rate": 3.951014158932572e-06, "loss": 0.85305011, "num_input_tokens_seen": 25075180, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.1015625, "step": 1176, "time_per_iteration": 2.4777917861938477 }, { "auxiliary_loss_clip": 0.01142268, "auxiliary_loss_mlp": 0.01058178, "balance_loss_clip": 1.02932918, "balance_loss_mlp": 1.03444862, "epoch": 0.07076506839019991, "flos": 22337569912320.0, "grad_norm": 4.217455463468714, "language_loss": 0.74490559, "learning_rate": 3.950931015283479e-06, "loss": 0.76691002, "num_input_tokens_seen": 25093035, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.078125, "step": 1177, "time_per_iteration": 2.4201037883758545 }, { "auxiliary_loss_clip": 0.01147636, "auxiliary_loss_mlp": 0.01051709, "balance_loss_clip": 1.0224309, "balance_loss_mlp": 1.03535318, "epoch": 0.07082519164286788, "flos": 18659818759680.0, "grad_norm": 2.1383008413969153, "language_loss": 0.86319709, "learning_rate": 3.950847802010675e-06, "loss": 0.88519061, "num_input_tokens_seen": 25112520, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.125, "step": 1178, "time_per_iteration": 2.4174270629882812 }, { "auxiliary_loss_clip": 0.01068847, "auxiliary_loss_mlp": 0.01008342, "balance_loss_clip": 1.00047445, "balance_loss_mlp": 1.01997209, "epoch": 0.07088531489553584, "flos": 63650676942720.0, "grad_norm": 0.8365517648153916, "language_loss": 0.63280094, "learning_rate": 3.950764519117132e-06, "loss": 0.65357292, "num_input_tokens_seen": 25177760, "router_z_loss_clip": 0.07861328, "router_z_loss_mlp": 0.48828125, "step": 1179, "time_per_iteration": 3.1840317249298096 }, { "auxiliary_loss_clip": 0.01145276, "auxiliary_loss_mlp": 0.0105284, "balance_loss_clip": 1.02395582, "balance_loss_mlp": 1.0354408, "epoch": 0.07094543814820382, "flos": 21571819862400.0, "grad_norm": 2.5128410300265416, "language_loss": 0.83514106, "learning_rate": 3.9506811666058215e-06, "loss": 0.85712224, "num_input_tokens_seen": 25195260, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.1015625, "step": 1180, "time_per_iteration": 2.446951389312744 }, { "auxiliary_loss_clip": 0.011423, "auxiliary_loss_mlp": 0.01053974, "balance_loss_clip": 1.02662778, "balance_loss_mlp": 1.03505707, "epoch": 0.07100556140087179, "flos": 22088883231360.0, "grad_norm": 2.205994704076047, "language_loss": 0.87598801, "learning_rate": 3.950597744479717e-06, "loss": 0.89795077, "num_input_tokens_seen": 25212740, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0703125, "step": 1181, "time_per_iteration": 2.4032480716705322 }, { "auxiliary_loss_clip": 0.0114354, "auxiliary_loss_mlp": 0.01055523, "balance_loss_clip": 1.02886796, "balance_loss_mlp": 1.03676593, "epoch": 0.07106568465353975, "flos": 47920491388800.0, "grad_norm": 2.04683450933953, "language_loss": 0.83846635, "learning_rate": 3.950514252741797e-06, "loss": 0.86045694, "num_input_tokens_seen": 25236420, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0703125, "step": 1182, "time_per_iteration": 2.7398793697357178 }, { "auxiliary_loss_clip": 0.01141069, "auxiliary_loss_mlp": 0.01048225, "balance_loss_clip": 1.01976967, "balance_loss_mlp": 1.03627372, "epoch": 0.07112580790620772, "flos": 23439672800640.0, "grad_norm": 3.851505484815403, "language_loss": 0.7913717, "learning_rate": 3.950430691395042e-06, "loss": 0.81326461, "num_input_tokens_seen": 25255120, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.046875, "step": 1183, "time_per_iteration": 2.4907233715057373 }, { "auxiliary_loss_clip": 0.01145502, "auxiliary_loss_mlp": 0.01055042, "balance_loss_clip": 1.02502549, "balance_loss_mlp": 1.03338456, "epoch": 0.0711859311588757, "flos": 31867531649280.0, "grad_norm": 2.0698299749698843, "language_loss": 0.78832853, "learning_rate": 3.95034706044243e-06, "loss": 0.81033391, "num_input_tokens_seen": 25275150, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.125, "step": 1184, "time_per_iteration": 2.532350540161133 }, { "auxiliary_loss_clip": 0.01141494, "auxiliary_loss_mlp": 0.01055609, "balance_loss_clip": 1.0272969, "balance_loss_mlp": 1.03506601, "epoch": 0.07124605441154366, "flos": 19609281717120.0, "grad_norm": 1.9766682763801302, "language_loss": 0.76702213, "learning_rate": 3.95026335988695e-06, "loss": 0.78899324, "num_input_tokens_seen": 25293680, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0625, "step": 1185, "time_per_iteration": 2.4000062942504883 }, { "auxiliary_loss_clip": 0.0114118, "auxiliary_loss_mlp": 0.01052528, "balance_loss_clip": 1.02598047, "balance_loss_mlp": 1.03560448, "epoch": 0.07130617766421163, "flos": 14683560549120.0, "grad_norm": 2.3640395760624795, "language_loss": 0.65478528, "learning_rate": 3.950179589731587e-06, "loss": 0.67672229, "num_input_tokens_seen": 25310050, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0546875, "step": 1186, "time_per_iteration": 2.405527353286743 }, { "auxiliary_loss_clip": 0.01141241, "auxiliary_loss_mlp": 0.01048348, "balance_loss_clip": 1.02125204, "balance_loss_mlp": 1.0361383, "epoch": 0.07136630091687961, "flos": 26066712453120.0, "grad_norm": 1.8983866206856574, "language_loss": 0.69567817, "learning_rate": 3.950095749979331e-06, "loss": 0.717574, "num_input_tokens_seen": 25331020, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.046875, "step": 1187, "time_per_iteration": 2.451582670211792 }, { "auxiliary_loss_clip": 0.01142274, "auxiliary_loss_mlp": 0.01049525, "balance_loss_clip": 1.02415752, "balance_loss_mlp": 1.03641868, "epoch": 0.07142642416954757, "flos": 15668285846400.0, "grad_norm": 2.595109808513564, "language_loss": 0.79029095, "learning_rate": 3.950011840633174e-06, "loss": 0.81220895, "num_input_tokens_seen": 25347875, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.0625, "step": 1188, "time_per_iteration": 2.4079363346099854 }, { "auxiliary_loss_clip": 0.01143543, "auxiliary_loss_mlp": 0.01049544, "balance_loss_clip": 1.02231669, "balance_loss_mlp": 1.03669262, "epoch": 0.07148654742221554, "flos": 19754310971520.0, "grad_norm": 1.9246330997835532, "language_loss": 0.84834594, "learning_rate": 3.9499278616961106e-06, "loss": 0.87027681, "num_input_tokens_seen": 25366715, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0703125, "step": 1189, "time_per_iteration": 2.4651424884796143 }, { "auxiliary_loss_clip": 0.01140024, "auxiliary_loss_mlp": 0.01049078, "balance_loss_clip": 1.02243471, "balance_loss_mlp": 1.03461695, "epoch": 0.07154667067488352, "flos": 23470850511360.0, "grad_norm": 1.8648074508764025, "language_loss": 0.76680577, "learning_rate": 3.949843813171137e-06, "loss": 0.78869677, "num_input_tokens_seen": 25385450, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.0546875, "step": 1190, "time_per_iteration": 2.448935031890869 }, { "auxiliary_loss_clip": 0.01145334, "auxiliary_loss_mlp": 0.01063064, "balance_loss_clip": 1.03400064, "balance_loss_mlp": 1.03551149, "epoch": 0.07160679392755148, "flos": 18331949381760.0, "grad_norm": 2.0180827544920383, "language_loss": 0.75543731, "learning_rate": 3.949759695061254e-06, "loss": 0.77752125, "num_input_tokens_seen": 25403940, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.09375, "step": 1191, "time_per_iteration": 2.420780897140503 }, { "auxiliary_loss_clip": 0.01143815, "auxiliary_loss_mlp": 0.01053508, "balance_loss_clip": 1.02427769, "balance_loss_mlp": 1.03578496, "epoch": 0.07166691718021945, "flos": 17746106901120.0, "grad_norm": 3.7572314550321306, "language_loss": 0.74226058, "learning_rate": 3.949675507369463e-06, "loss": 0.76423383, "num_input_tokens_seen": 25420410, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.078125, "step": 1192, "time_per_iteration": 2.41892409324646 }, { "auxiliary_loss_clip": 0.01139894, "auxiliary_loss_mlp": 0.0104925, "balance_loss_clip": 1.02218974, "balance_loss_mlp": 1.03323972, "epoch": 0.07172704043288743, "flos": 22450932696960.0, "grad_norm": 2.174877482137923, "language_loss": 0.78133452, "learning_rate": 3.949591250098768e-06, "loss": 0.80322599, "num_input_tokens_seen": 25439415, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.0703125, "step": 1193, "time_per_iteration": 2.4201836585998535 }, { "auxiliary_loss_clip": 0.01145021, "auxiliary_loss_mlp": 0.01050061, "balance_loss_clip": 1.02161729, "balance_loss_mlp": 1.03653884, "epoch": 0.07178716368555539, "flos": 23221081578240.0, "grad_norm": 2.173482196864183, "language_loss": 0.85534096, "learning_rate": 3.949506923252175e-06, "loss": 0.87729174, "num_input_tokens_seen": 25458715, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0859375, "step": 1194, "time_per_iteration": 2.473048448562622 }, { "auxiliary_loss_clip": 0.01141008, "auxiliary_loss_mlp": 0.01053987, "balance_loss_clip": 1.02797532, "balance_loss_mlp": 1.03503466, "epoch": 0.07184728693822336, "flos": 25149788749440.0, "grad_norm": 2.39331448397616, "language_loss": 0.81294763, "learning_rate": 3.9494225268326965e-06, "loss": 0.83489752, "num_input_tokens_seen": 25477985, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.0625, "step": 1195, "time_per_iteration": 2.423985719680786 }, { "auxiliary_loss_clip": 0.01142658, "auxiliary_loss_mlp": 0.01044567, "balance_loss_clip": 1.01861489, "balance_loss_mlp": 1.03682089, "epoch": 0.07190741019089132, "flos": 22710127697280.0, "grad_norm": 1.9294649589775585, "language_loss": 0.7980628, "learning_rate": 3.949338060843342e-06, "loss": 0.81993502, "num_input_tokens_seen": 25497110, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.0625, "step": 1196, "time_per_iteration": 2.4577596187591553 }, { "auxiliary_loss_clip": 0.01137897, "auxiliary_loss_mlp": 0.01062488, "balance_loss_clip": 1.03280461, "balance_loss_mlp": 1.03426504, "epoch": 0.0719675334435593, "flos": 29348548254720.0, "grad_norm": 3.0521239106417553, "language_loss": 0.70851308, "learning_rate": 3.949253525287126e-06, "loss": 0.73051691, "num_input_tokens_seen": 25516555, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.0390625, "step": 1197, "time_per_iteration": 2.4613070487976074 }, { "auxiliary_loss_clip": 0.01138148, "auxiliary_loss_mlp": 0.01048628, "balance_loss_clip": 1.02221167, "balance_loss_mlp": 1.03309786, "epoch": 0.07202765669622727, "flos": 17638818693120.0, "grad_norm": 4.029424760912505, "language_loss": 0.85489368, "learning_rate": 3.9491689201670655e-06, "loss": 0.8767615, "num_input_tokens_seen": 25533895, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.046875, "step": 1198, "time_per_iteration": 2.4109957218170166 }, { "auxiliary_loss_clip": 0.01142444, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.02474451, "balance_loss_mlp": 1.03578997, "epoch": 0.07208777994889523, "flos": 21432969918720.0, "grad_norm": 2.2387513912187056, "language_loss": 0.83341557, "learning_rate": 3.94908424548618e-06, "loss": 0.85537809, "num_input_tokens_seen": 25554195, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.0703125, "step": 1199, "time_per_iteration": 3.895958185195923 }, { "auxiliary_loss_clip": 0.01146576, "auxiliary_loss_mlp": 0.01055465, "balance_loss_clip": 1.02858305, "balance_loss_mlp": 1.03858709, "epoch": 0.07214790320156321, "flos": 26939715799680.0, "grad_norm": 2.230486717462451, "language_loss": 0.75693011, "learning_rate": 3.9489995012474924e-06, "loss": 0.77895045, "num_input_tokens_seen": 25574155, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.078125, "step": 1200, "time_per_iteration": 2.5096075534820557 }, { "auxiliary_loss_clip": 0.0114182, "auxiliary_loss_mlp": 0.01056701, "balance_loss_clip": 1.02809119, "balance_loss_mlp": 1.03691626, "epoch": 0.07220802645423118, "flos": 23878775370240.0, "grad_norm": 2.228077108250191, "language_loss": 0.8275224, "learning_rate": 3.948914687454027e-06, "loss": 0.84950757, "num_input_tokens_seen": 25592735, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.046875, "step": 1201, "time_per_iteration": 2.4473869800567627 }, { "auxiliary_loss_clip": 0.01143448, "auxiliary_loss_mlp": 0.01054304, "balance_loss_clip": 1.02470398, "balance_loss_mlp": 1.03431582, "epoch": 0.07226814970689914, "flos": 19242658863360.0, "grad_norm": 2.3020796644932813, "language_loss": 0.68767619, "learning_rate": 3.948829804108807e-06, "loss": 0.70965374, "num_input_tokens_seen": 25611510, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.09375, "step": 1202, "time_per_iteration": 5.258546590805054 }, { "auxiliary_loss_clip": 0.01142037, "auxiliary_loss_mlp": 0.01052944, "balance_loss_clip": 1.02426255, "balance_loss_mlp": 1.03484988, "epoch": 0.07232827295956712, "flos": 19171017020160.0, "grad_norm": 2.6378065975950515, "language_loss": 0.87662745, "learning_rate": 3.948744851214865e-06, "loss": 0.89857721, "num_input_tokens_seen": 25629560, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.078125, "step": 1203, "time_per_iteration": 3.90230393409729 }, { "auxiliary_loss_clip": 0.01145265, "auxiliary_loss_mlp": 0.01051896, "balance_loss_clip": 1.02352452, "balance_loss_mlp": 1.03499961, "epoch": 0.07238839621223508, "flos": 17638783781760.0, "grad_norm": 1.9076009635896547, "language_loss": 0.78297997, "learning_rate": 3.948659828775233e-06, "loss": 0.80495155, "num_input_tokens_seen": 25648330, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.109375, "step": 1204, "time_per_iteration": 2.4286630153656006 }, { "auxiliary_loss_clip": 0.01141302, "auxiliary_loss_mlp": 0.01051983, "balance_loss_clip": 1.02534008, "balance_loss_mlp": 1.0341996, "epoch": 0.07244851946490305, "flos": 28291168684800.0, "grad_norm": 1.6565434766687437, "language_loss": 0.82000256, "learning_rate": 3.9485747367929436e-06, "loss": 0.8419354, "num_input_tokens_seen": 25669470, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0703125, "step": 1205, "time_per_iteration": 2.4822306632995605 }, { "auxiliary_loss_clip": 0.01144075, "auxiliary_loss_mlp": 0.01054803, "balance_loss_clip": 1.02466643, "balance_loss_mlp": 1.03717446, "epoch": 0.07250864271757101, "flos": 22563736899840.0, "grad_norm": 2.0059740658311545, "language_loss": 0.7660293, "learning_rate": 3.948489575271035e-06, "loss": 0.78801811, "num_input_tokens_seen": 25690470, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0625, "step": 1206, "time_per_iteration": 2.473125457763672 }, { "auxiliary_loss_clip": 0.01143495, "auxiliary_loss_mlp": 0.01050002, "balance_loss_clip": 1.02232158, "balance_loss_mlp": 1.03645301, "epoch": 0.072568765970239, "flos": 21761328055680.0, "grad_norm": 2.467119437823108, "language_loss": 0.77418441, "learning_rate": 3.948404344212544e-06, "loss": 0.79611939, "num_input_tokens_seen": 25709205, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0703125, "step": 1207, "time_per_iteration": 2.404226779937744 }, { "auxiliary_loss_clip": 0.01140906, "auxiliary_loss_mlp": 0.01048589, "balance_loss_clip": 1.02285171, "balance_loss_mlp": 1.03607512, "epoch": 0.07262888922290696, "flos": 25518541196160.0, "grad_norm": 2.4181522111205536, "language_loss": 0.79696399, "learning_rate": 3.948319043620516e-06, "loss": 0.81885892, "num_input_tokens_seen": 25728485, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.046875, "step": 1208, "time_per_iteration": 2.4818239212036133 }, { "auxiliary_loss_clip": 0.01141341, "auxiliary_loss_mlp": 0.01045132, "balance_loss_clip": 1.01904941, "balance_loss_mlp": 1.03661633, "epoch": 0.07268901247557492, "flos": 21245626229760.0, "grad_norm": 2.396122437366724, "language_loss": 0.78514445, "learning_rate": 3.948233673497991e-06, "loss": 0.80700922, "num_input_tokens_seen": 25747730, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.046875, "step": 1209, "time_per_iteration": 2.4050676822662354 }, { "auxiliary_loss_clip": 0.0114268, "auxiliary_loss_mlp": 0.01055349, "balance_loss_clip": 1.0285393, "balance_loss_mlp": 1.03766227, "epoch": 0.0727491357282429, "flos": 25478251620480.0, "grad_norm": 2.593857403459947, "language_loss": 0.8194046, "learning_rate": 3.948148233848018e-06, "loss": 0.84138495, "num_input_tokens_seen": 25768050, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.0546875, "step": 1210, "time_per_iteration": 2.4727799892425537 }, { "auxiliary_loss_clip": 0.01140298, "auxiliary_loss_mlp": 0.01054591, "balance_loss_clip": 1.02724493, "balance_loss_mlp": 1.03657985, "epoch": 0.07280925898091087, "flos": 24461021980800.0, "grad_norm": 1.7600465661830798, "language_loss": 0.84463573, "learning_rate": 3.948062724673646e-06, "loss": 0.8665846, "num_input_tokens_seen": 25787985, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0390625, "step": 1211, "time_per_iteration": 2.4353771209716797 }, { "auxiliary_loss_clip": 0.01137977, "auxiliary_loss_mlp": 0.01042181, "balance_loss_clip": 1.01647985, "balance_loss_mlp": 1.03354049, "epoch": 0.07286938223357883, "flos": 18287435531520.0, "grad_norm": 2.3416196011709323, "language_loss": 0.90241849, "learning_rate": 3.947977145977927e-06, "loss": 0.92422009, "num_input_tokens_seen": 25803620, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.046875, "step": 1212, "time_per_iteration": 2.371009111404419 }, { "auxiliary_loss_clip": 0.0114141, "auxiliary_loss_mlp": 0.01046974, "balance_loss_clip": 1.01823235, "balance_loss_mlp": 1.03664112, "epoch": 0.07292950548624681, "flos": 21213750291840.0, "grad_norm": 2.0401905048381983, "language_loss": 0.72653985, "learning_rate": 3.947891497763914e-06, "loss": 0.7484237, "num_input_tokens_seen": 25823315, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.046875, "step": 1213, "time_per_iteration": 2.4208662509918213 }, { "auxiliary_loss_clip": 0.01143135, "auxiliary_loss_mlp": 0.01044519, "balance_loss_clip": 1.01767325, "balance_loss_mlp": 1.03473377, "epoch": 0.07298962873891478, "flos": 24640929550080.0, "grad_norm": 1.7914246045079538, "language_loss": 0.84198576, "learning_rate": 3.947805780034664e-06, "loss": 0.86386228, "num_input_tokens_seen": 25842605, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.0859375, "step": 1214, "time_per_iteration": 2.448817253112793 }, { "auxiliary_loss_clip": 0.01146305, "auxiliary_loss_mlp": 0.01049021, "balance_loss_clip": 1.021662, "balance_loss_mlp": 1.03676653, "epoch": 0.07304975199158274, "flos": 27051542484480.0, "grad_norm": 2.7801557012053837, "language_loss": 0.84115255, "learning_rate": 3.947719992793236e-06, "loss": 0.86310577, "num_input_tokens_seen": 25863030, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.09375, "step": 1215, "time_per_iteration": 2.4775185585021973 }, { "auxiliary_loss_clip": 0.01141298, "auxiliary_loss_mlp": 0.01048689, "balance_loss_clip": 1.0214498, "balance_loss_mlp": 1.03517699, "epoch": 0.07310987524425071, "flos": 33548075809920.0, "grad_norm": 2.0020276426992583, "language_loss": 0.80888009, "learning_rate": 3.9476341360426924e-06, "loss": 0.83077991, "num_input_tokens_seen": 25888015, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0625, "step": 1216, "time_per_iteration": 2.6004252433776855 }, { "auxiliary_loss_clip": 0.01145917, "auxiliary_loss_mlp": 0.01044281, "balance_loss_clip": 1.01780462, "balance_loss_mlp": 1.0389626, "epoch": 0.07316999849691869, "flos": 28109690104320.0, "grad_norm": 2.2619077943459716, "language_loss": 0.76420432, "learning_rate": 3.9475482097860955e-06, "loss": 0.78610629, "num_input_tokens_seen": 25908660, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0703125, "step": 1217, "time_per_iteration": 2.4730021953582764 }, { "auxiliary_loss_clip": 0.0113805, "auxiliary_loss_mlp": 0.01051466, "balance_loss_clip": 1.02534699, "balance_loss_mlp": 1.03639913, "epoch": 0.07323012174958665, "flos": 14391721560960.0, "grad_norm": 2.01205891769929, "language_loss": 0.86498004, "learning_rate": 3.947462214026512e-06, "loss": 0.88687515, "num_input_tokens_seen": 25927215, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.015625, "step": 1218, "time_per_iteration": 2.4093971252441406 }, { "auxiliary_loss_clip": 0.01141823, "auxiliary_loss_mlp": 0.01057249, "balance_loss_clip": 1.03045118, "balance_loss_mlp": 1.03431439, "epoch": 0.07329024500225462, "flos": 21615356194560.0, "grad_norm": 1.733751255105255, "language_loss": 0.86781025, "learning_rate": 3.947376148767013e-06, "loss": 0.88980097, "num_input_tokens_seen": 25945500, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.078125, "step": 1219, "time_per_iteration": 2.4079973697662354 }, { "auxiliary_loss_clip": 0.01139537, "auxiliary_loss_mlp": 0.0105074, "balance_loss_clip": 1.02428699, "balance_loss_mlp": 1.03407681, "epoch": 0.0733503682549226, "flos": 13223318267520.0, "grad_norm": 2.5594197446101523, "language_loss": 0.84308958, "learning_rate": 3.947290014010668e-06, "loss": 0.86499238, "num_input_tokens_seen": 25963105, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.0546875, "step": 1220, "time_per_iteration": 2.3945207595825195 }, { "auxiliary_loss_clip": 0.01140314, "auxiliary_loss_mlp": 0.01054931, "balance_loss_clip": 1.02772701, "balance_loss_mlp": 1.034724, "epoch": 0.07341049150759056, "flos": 20885915825280.0, "grad_norm": 4.876721254970153, "language_loss": 0.76878929, "learning_rate": 3.9472038097605516e-06, "loss": 0.7907418, "num_input_tokens_seen": 25981690, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0546875, "step": 1221, "time_per_iteration": 2.4245474338531494 }, { "auxiliary_loss_clip": 0.01143765, "auxiliary_loss_mlp": 0.01053598, "balance_loss_clip": 1.02473783, "balance_loss_mlp": 1.03804398, "epoch": 0.07347061476025853, "flos": 15412721627520.0, "grad_norm": 2.9206585572953103, "language_loss": 0.91950142, "learning_rate": 3.94711753601974e-06, "loss": 0.94147503, "num_input_tokens_seen": 25999890, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0546875, "step": 1222, "time_per_iteration": 2.3849406242370605 }, { "auxiliary_loss_clip": 0.01148281, "auxiliary_loss_mlp": 0.01049477, "balance_loss_clip": 1.0222857, "balance_loss_mlp": 1.03966355, "epoch": 0.0735307380129265, "flos": 11108070368640.0, "grad_norm": 2.3887842126224474, "language_loss": 0.90904081, "learning_rate": 3.947031192791312e-06, "loss": 0.93101841, "num_input_tokens_seen": 26016445, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0859375, "step": 1223, "time_per_iteration": 2.439429759979248 }, { "auxiliary_loss_clip": 0.01142685, "auxiliary_loss_mlp": 0.01054166, "balance_loss_clip": 1.02712965, "balance_loss_mlp": 1.03838503, "epoch": 0.07359086126559447, "flos": 23731267409280.0, "grad_norm": 2.124319472946957, "language_loss": 0.81972909, "learning_rate": 3.9469447800783485e-06, "loss": 0.84169757, "num_input_tokens_seen": 26036080, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.046875, "step": 1224, "time_per_iteration": 2.4450085163116455 }, { "auxiliary_loss_clip": 0.01140637, "auxiliary_loss_mlp": 0.01059567, "balance_loss_clip": 1.03000343, "balance_loss_mlp": 1.03389513, "epoch": 0.07365098451826244, "flos": 20992296337920.0, "grad_norm": 2.4175849250634416, "language_loss": 0.83184677, "learning_rate": 3.946858297883935e-06, "loss": 0.85384881, "num_input_tokens_seen": 26055805, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.0625, "step": 1225, "time_per_iteration": 2.4287068843841553 }, { "auxiliary_loss_clip": 0.01141, "auxiliary_loss_mlp": 0.01049981, "balance_loss_clip": 1.02364779, "balance_loss_mlp": 1.03529191, "epoch": 0.0737111077709304, "flos": 19932682440960.0, "grad_norm": 2.011532145546205, "language_loss": 0.90203059, "learning_rate": 3.946771746211156e-06, "loss": 0.9239403, "num_input_tokens_seen": 26073905, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.0546875, "step": 1226, "time_per_iteration": 2.3896918296813965 }, { "auxiliary_loss_clip": 0.01149355, "auxiliary_loss_mlp": 0.01046833, "balance_loss_clip": 1.01710188, "balance_loss_mlp": 1.03556871, "epoch": 0.07377123102359838, "flos": 16580601250560.0, "grad_norm": 2.5947243267518085, "language_loss": 0.76146984, "learning_rate": 3.946685125063101e-06, "loss": 0.78343177, "num_input_tokens_seen": 26091700, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.140625, "step": 1227, "time_per_iteration": 2.448315143585205 }, { "auxiliary_loss_clip": 0.01141519, "auxiliary_loss_mlp": 0.01054329, "balance_loss_clip": 1.02749527, "balance_loss_mlp": 1.03901958, "epoch": 0.07383135427626634, "flos": 28327338720000.0, "grad_norm": 1.6501807133980002, "language_loss": 0.85381699, "learning_rate": 3.9465984344428615e-06, "loss": 0.87577546, "num_input_tokens_seen": 26114105, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.0234375, "step": 1228, "time_per_iteration": 2.4647083282470703 }, { "auxiliary_loss_clip": 0.01144267, "auxiliary_loss_mlp": 0.0104705, "balance_loss_clip": 1.01946509, "balance_loss_mlp": 1.03646779, "epoch": 0.07389147752893431, "flos": 20046149959680.0, "grad_norm": 2.6306386628314633, "language_loss": 0.79686767, "learning_rate": 3.946511674353531e-06, "loss": 0.81878078, "num_input_tokens_seen": 26131165, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.078125, "step": 1229, "time_per_iteration": 2.4147233963012695 }, { "auxiliary_loss_clip": 0.01143158, "auxiliary_loss_mlp": 0.01053877, "balance_loss_clip": 1.0235498, "balance_loss_mlp": 1.03548288, "epoch": 0.07395160078160229, "flos": 18113148691200.0, "grad_norm": 2.6013210374683204, "language_loss": 0.78361106, "learning_rate": 3.9464248447982065e-06, "loss": 0.80558145, "num_input_tokens_seen": 26150040, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.078125, "step": 1230, "time_per_iteration": 2.377610445022583 }, { "auxiliary_loss_clip": 0.01139386, "auxiliary_loss_mlp": 0.01046705, "balance_loss_clip": 1.01846445, "balance_loss_mlp": 1.03478765, "epoch": 0.07401172403427025, "flos": 23585784307200.0, "grad_norm": 2.1179927963574534, "language_loss": 0.81063914, "learning_rate": 3.946337945779986e-06, "loss": 0.83249998, "num_input_tokens_seen": 26169380, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.046875, "step": 1231, "time_per_iteration": 2.456991672515869 }, { "auxiliary_loss_clip": 0.01143164, "auxiliary_loss_mlp": 0.01050602, "balance_loss_clip": 1.0215987, "balance_loss_mlp": 1.03441632, "epoch": 0.07407184728693822, "flos": 26358691086720.0, "grad_norm": 2.1789172637567487, "language_loss": 0.94616294, "learning_rate": 3.94625097730197e-06, "loss": 0.96810061, "num_input_tokens_seen": 26189420, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0859375, "step": 1232, "time_per_iteration": 2.4660682678222656 }, { "auxiliary_loss_clip": 0.01139915, "auxiliary_loss_mlp": 0.01049601, "balance_loss_clip": 1.02361298, "balance_loss_mlp": 1.03494775, "epoch": 0.0741319705396062, "flos": 22199348373120.0, "grad_norm": 1.8981820065754686, "language_loss": 0.81123012, "learning_rate": 3.946163939367264e-06, "loss": 0.83312529, "num_input_tokens_seen": 26209300, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.046875, "step": 1233, "time_per_iteration": 2.4436287879943848 }, { "auxiliary_loss_clip": 0.01146831, "auxiliary_loss_mlp": 0.01059435, "balance_loss_clip": 1.02751112, "balance_loss_mlp": 1.03701162, "epoch": 0.07419209379227416, "flos": 39198978161280.0, "grad_norm": 2.275438771641701, "language_loss": 0.70302069, "learning_rate": 3.9460768319789724e-06, "loss": 0.72508335, "num_input_tokens_seen": 26228110, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.1015625, "step": 1234, "time_per_iteration": 2.5459272861480713 }, { "auxiliary_loss_clip": 0.01144109, "auxiliary_loss_mlp": 0.01052466, "balance_loss_clip": 1.024261, "balance_loss_mlp": 1.03747475, "epoch": 0.07425221704494213, "flos": 22780617465600.0, "grad_norm": 1.9535843715857266, "language_loss": 0.77411473, "learning_rate": 3.945989655140205e-06, "loss": 0.79608047, "num_input_tokens_seen": 26247020, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0625, "step": 1235, "time_per_iteration": 2.465358018875122 }, { "auxiliary_loss_clip": 0.01142052, "auxiliary_loss_mlp": 0.01047285, "balance_loss_clip": 1.0195688, "balance_loss_mlp": 1.03596783, "epoch": 0.0743123402976101, "flos": 22271897911680.0, "grad_norm": 2.1740463529522676, "language_loss": 0.8237235, "learning_rate": 3.945902408854073e-06, "loss": 0.84561688, "num_input_tokens_seen": 26265750, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0625, "step": 1236, "time_per_iteration": 2.433403730392456 }, { "auxiliary_loss_clip": 0.01142526, "auxiliary_loss_mlp": 0.01054264, "balance_loss_clip": 1.02632093, "balance_loss_mlp": 1.03503644, "epoch": 0.07437246355027807, "flos": 29313739762560.0, "grad_norm": 2.189070570878333, "language_loss": 0.7565378, "learning_rate": 3.945815093123688e-06, "loss": 0.77850574, "num_input_tokens_seen": 26287905, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.078125, "step": 1237, "time_per_iteration": 2.515289545059204 }, { "auxiliary_loss_clip": 0.01141188, "auxiliary_loss_mlp": 0.01051467, "balance_loss_clip": 1.02457392, "balance_loss_mlp": 1.03395224, "epoch": 0.07443258680294604, "flos": 31943293032960.0, "grad_norm": 1.6945120127682058, "language_loss": 0.77806079, "learning_rate": 3.945727707952168e-06, "loss": 0.79998732, "num_input_tokens_seen": 26311795, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.0703125, "step": 1238, "time_per_iteration": 2.539829730987549 }, { "auxiliary_loss_clip": 0.01146616, "auxiliary_loss_mlp": 0.01052864, "balance_loss_clip": 1.02389622, "balance_loss_mlp": 1.03531945, "epoch": 0.074492710055614, "flos": 22674167130240.0, "grad_norm": 1.9737472469547397, "language_loss": 0.86791956, "learning_rate": 3.945640253342632e-06, "loss": 0.88991439, "num_input_tokens_seen": 26330330, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.1171875, "step": 1239, "time_per_iteration": 3.8863275051116943 }, { "auxiliary_loss_clip": 0.01143857, "auxiliary_loss_mlp": 0.01046158, "balance_loss_clip": 1.01618838, "balance_loss_mlp": 1.03571773, "epoch": 0.07455283330828198, "flos": 21283925857920.0, "grad_norm": 1.87822766520297, "language_loss": 0.88759482, "learning_rate": 3.9455527292981996e-06, "loss": 0.909495, "num_input_tokens_seen": 26348865, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.078125, "step": 1240, "time_per_iteration": 2.4443845748901367 }, { "auxiliary_loss_clip": 0.01144719, "auxiliary_loss_mlp": 0.01053837, "balance_loss_clip": 1.02510726, "balance_loss_mlp": 1.0371182, "epoch": 0.07461295656094995, "flos": 24387285456000.0, "grad_norm": 2.065054906840633, "language_loss": 0.89345175, "learning_rate": 3.945465135821995e-06, "loss": 0.91543734, "num_input_tokens_seen": 26368210, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.078125, "step": 1241, "time_per_iteration": 3.873291254043579 }, { "auxiliary_loss_clip": 0.01042863, "auxiliary_loss_mlp": 0.01006544, "balance_loss_clip": 1.00244331, "balance_loss_mlp": 1.0090704, "epoch": 0.07467307981361791, "flos": 62106608753280.0, "grad_norm": 0.889362179100009, "language_loss": 0.63114607, "learning_rate": 3.9453774729171435e-06, "loss": 0.65164018, "num_input_tokens_seen": 26424890, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.33789062, "step": 1242, "time_per_iteration": 6.085329055786133 }, { "auxiliary_loss_clip": 0.01153597, "auxiliary_loss_mlp": 0.01054763, "balance_loss_clip": 1.0232439, "balance_loss_mlp": 1.03737187, "epoch": 0.07473320306628589, "flos": 24861999479040.0, "grad_norm": 2.764946515618584, "language_loss": 0.62563753, "learning_rate": 3.945289740586775e-06, "loss": 0.64772117, "num_input_tokens_seen": 26446405, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.1640625, "step": 1243, "time_per_iteration": 2.450406074523926 }, { "auxiliary_loss_clip": 0.0114227, "auxiliary_loss_mlp": 0.01044545, "balance_loss_clip": 1.01800931, "balance_loss_mlp": 1.03594351, "epoch": 0.07479332631895386, "flos": 24896354123520.0, "grad_norm": 1.845577187094996, "language_loss": 0.76297748, "learning_rate": 3.945201938834018e-06, "loss": 0.78484559, "num_input_tokens_seen": 26466070, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0625, "step": 1244, "time_per_iteration": 2.448031187057495 }, { "auxiliary_loss_clip": 0.01147805, "auxiliary_loss_mlp": 0.01054192, "balance_loss_clip": 1.02586758, "balance_loss_mlp": 1.03727245, "epoch": 0.07485344957162182, "flos": 17814467076480.0, "grad_norm": 3.423545890676594, "language_loss": 0.69239521, "learning_rate": 3.945114067662009e-06, "loss": 0.71441513, "num_input_tokens_seen": 26479350, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.109375, "step": 1245, "time_per_iteration": 2.3707425594329834 }, { "auxiliary_loss_clip": 0.01144149, "auxiliary_loss_mlp": 0.01052193, "balance_loss_clip": 1.02332044, "balance_loss_mlp": 1.03658879, "epoch": 0.0749135728242898, "flos": 25009018680960.0, "grad_norm": 1.7770744627600272, "language_loss": 0.88667941, "learning_rate": 3.9450261270738815e-06, "loss": 0.90864277, "num_input_tokens_seen": 26498255, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.078125, "step": 1246, "time_per_iteration": 2.4962313175201416 }, { "auxiliary_loss_clip": 0.01152627, "auxiliary_loss_mlp": 0.01055953, "balance_loss_clip": 1.0230515, "balance_loss_mlp": 1.03740907, "epoch": 0.07497369607695777, "flos": 17821100234880.0, "grad_norm": 2.4445910492133485, "language_loss": 0.88317931, "learning_rate": 3.944938117072776e-06, "loss": 0.90526509, "num_input_tokens_seen": 26515375, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.15625, "step": 1247, "time_per_iteration": 2.4028384685516357 }, { "auxiliary_loss_clip": 0.01141357, "auxiliary_loss_mlp": 0.01053357, "balance_loss_clip": 1.02529478, "balance_loss_mlp": 1.03495264, "epoch": 0.07503381932962573, "flos": 15120219323520.0, "grad_norm": 2.405659657463368, "language_loss": 0.64709055, "learning_rate": 3.944850037661831e-06, "loss": 0.66903764, "num_input_tokens_seen": 26533595, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0625, "step": 1248, "time_per_iteration": 2.442790985107422 }, { "auxiliary_loss_clip": 0.01141608, "auxiliary_loss_mlp": 0.01052677, "balance_loss_clip": 1.02704656, "balance_loss_mlp": 1.03831017, "epoch": 0.0750939425822937, "flos": 12816091635840.0, "grad_norm": 2.2582700393608524, "language_loss": 0.74438941, "learning_rate": 3.944761888844191e-06, "loss": 0.76633227, "num_input_tokens_seen": 26549405, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.03125, "step": 1249, "time_per_iteration": 2.3979647159576416 }, { "auxiliary_loss_clip": 0.01147036, "auxiliary_loss_mlp": 0.01063925, "balance_loss_clip": 1.03414643, "balance_loss_mlp": 1.0371995, "epoch": 0.07515406583496168, "flos": 24205702141440.0, "grad_norm": 3.3231514265364357, "language_loss": 0.8245669, "learning_rate": 3.944673670623001e-06, "loss": 0.84667647, "num_input_tokens_seen": 26567200, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.09375, "step": 1250, "time_per_iteration": 2.469184398651123 }, { "auxiliary_loss_clip": 0.01144078, "auxiliary_loss_mlp": 0.01053375, "balance_loss_clip": 1.02534926, "balance_loss_mlp": 1.03903031, "epoch": 0.07521418908762964, "flos": 26686944489600.0, "grad_norm": 6.605391922579396, "language_loss": 0.669029, "learning_rate": 3.944585383001411e-06, "loss": 0.69100344, "num_input_tokens_seen": 26586190, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.0546875, "step": 1251, "time_per_iteration": 2.5036938190460205 }, { "auxiliary_loss_clip": 0.01039191, "auxiliary_loss_mlp": 0.0100688, "balance_loss_clip": 1.00211179, "balance_loss_mlp": 1.00628459, "epoch": 0.0752743123402976, "flos": 59091788096640.0, "grad_norm": 0.8883458310414257, "language_loss": 0.70415509, "learning_rate": 3.944497025982571e-06, "loss": 0.72461569, "num_input_tokens_seen": 26650710, "router_z_loss_clip": 0.04760742, "router_z_loss_mlp": 0.33007812, "step": 1252, "time_per_iteration": 3.095370054244995 }, { "auxiliary_loss_clip": 0.01142421, "auxiliary_loss_mlp": 0.01061465, "balance_loss_clip": 1.03253305, "balance_loss_mlp": 1.03514695, "epoch": 0.07533443559296558, "flos": 23475912658560.0, "grad_norm": 2.1062586387472946, "language_loss": 0.79992402, "learning_rate": 3.944408599569633e-06, "loss": 0.82196289, "num_input_tokens_seen": 26669000, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0703125, "step": 1253, "time_per_iteration": 2.4121005535125732 }, { "auxiliary_loss_clip": 0.01146671, "auxiliary_loss_mlp": 0.01057295, "balance_loss_clip": 1.0272783, "balance_loss_mlp": 1.03715527, "epoch": 0.07539455884563355, "flos": 20878270237440.0, "grad_norm": 2.9367962325269223, "language_loss": 0.9338783, "learning_rate": 3.9443201037657545e-06, "loss": 0.95591795, "num_input_tokens_seen": 26683075, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.09375, "step": 1254, "time_per_iteration": 2.419344663619995 }, { "auxiliary_loss_clip": 0.01139612, "auxiliary_loss_mlp": 0.01047563, "balance_loss_clip": 1.01966774, "balance_loss_mlp": 1.03530228, "epoch": 0.07545468209830151, "flos": 27671669786880.0, "grad_norm": 4.104633237380502, "language_loss": 0.87933367, "learning_rate": 3.944231538574092e-06, "loss": 0.90120542, "num_input_tokens_seen": 26701875, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.046875, "step": 1255, "time_per_iteration": 2.4580881595611572 }, { "auxiliary_loss_clip": 0.01140922, "auxiliary_loss_mlp": 0.01051102, "balance_loss_clip": 1.02168155, "balance_loss_mlp": 1.03676486, "epoch": 0.0755148053509695, "flos": 14136122430720.0, "grad_norm": 1.7522842390054543, "language_loss": 0.79969382, "learning_rate": 3.9441429039978086e-06, "loss": 0.82161403, "num_input_tokens_seen": 26719050, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.046875, "step": 1256, "time_per_iteration": 2.4150822162628174 }, { "auxiliary_loss_clip": 0.01040318, "auxiliary_loss_mlp": 0.01004824, "balance_loss_clip": 1.00017476, "balance_loss_mlp": 1.00681496, "epoch": 0.07557492860363746, "flos": 58232506780800.0, "grad_norm": 0.7715466436615287, "language_loss": 0.58031034, "learning_rate": 3.944054200040065e-06, "loss": 0.60076171, "num_input_tokens_seen": 26780650, "router_z_loss_clip": 0.04638672, "router_z_loss_mlp": 0.3359375, "step": 1257, "time_per_iteration": 3.169145107269287 }, { "auxiliary_loss_clip": 0.01143633, "auxiliary_loss_mlp": 0.01056884, "balance_loss_clip": 1.02752304, "balance_loss_mlp": 1.03853524, "epoch": 0.07563505185630542, "flos": 24643233699840.0, "grad_norm": 2.6403445214651766, "language_loss": 0.89664084, "learning_rate": 3.943965426704027e-06, "loss": 0.91864598, "num_input_tokens_seen": 26798725, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.0546875, "step": 1258, "time_per_iteration": 2.4530155658721924 }, { "auxiliary_loss_clip": 0.01142097, "auxiliary_loss_mlp": 0.01055937, "balance_loss_clip": 1.02931738, "balance_loss_mlp": 1.03846896, "epoch": 0.07569517510897339, "flos": 15522104517120.0, "grad_norm": 2.0561855104483153, "language_loss": 0.80861282, "learning_rate": 3.943876583992864e-06, "loss": 0.83059323, "num_input_tokens_seen": 26817005, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.03125, "step": 1259, "time_per_iteration": 2.41066837310791 }, { "auxiliary_loss_clip": 0.01141372, "auxiliary_loss_mlp": 0.01054204, "balance_loss_clip": 1.02615404, "balance_loss_mlp": 1.03635323, "epoch": 0.07575529836164137, "flos": 22927462110720.0, "grad_norm": 1.7786873652159558, "language_loss": 0.75696343, "learning_rate": 3.943787671909746e-06, "loss": 0.7789191, "num_input_tokens_seen": 26836655, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.046875, "step": 1260, "time_per_iteration": 2.4653754234313965 }, { "auxiliary_loss_clip": 0.01142378, "auxiliary_loss_mlp": 0.0105818, "balance_loss_clip": 1.0282104, "balance_loss_mlp": 1.03585958, "epoch": 0.07581542161430933, "flos": 19499410068480.0, "grad_norm": 2.2183576565645375, "language_loss": 0.84589267, "learning_rate": 3.943698690457846e-06, "loss": 0.86789823, "num_input_tokens_seen": 26854925, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0625, "step": 1261, "time_per_iteration": 2.437059164047241 }, { "auxiliary_loss_clip": 0.0114615, "auxiliary_loss_mlp": 0.01060723, "balance_loss_clip": 1.03254223, "balance_loss_mlp": 1.03722024, "epoch": 0.0758755448669773, "flos": 24972290064000.0, "grad_norm": 1.8577573636488671, "language_loss": 0.83029902, "learning_rate": 3.943609639640339e-06, "loss": 0.85236776, "num_input_tokens_seen": 26876170, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0859375, "step": 1262, "time_per_iteration": 2.51898455619812 }, { "auxiliary_loss_clip": 0.01139744, "auxiliary_loss_mlp": 0.01054691, "balance_loss_clip": 1.02629542, "balance_loss_mlp": 1.03495586, "epoch": 0.07593566811964528, "flos": 22746856314240.0, "grad_norm": 3.162190663494559, "language_loss": 0.82544661, "learning_rate": 3.943520519460405e-06, "loss": 0.84739101, "num_input_tokens_seen": 26895005, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.046875, "step": 1263, "time_per_iteration": 2.421804189682007 }, { "auxiliary_loss_clip": 0.01144962, "auxiliary_loss_mlp": 0.01046337, "balance_loss_clip": 1.01865637, "balance_loss_mlp": 1.03497577, "epoch": 0.07599579137231324, "flos": 23111279752320.0, "grad_norm": 2.6172494042949146, "language_loss": 0.76007628, "learning_rate": 3.943431329921221e-06, "loss": 0.78198922, "num_input_tokens_seen": 26913930, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.1015625, "step": 1264, "time_per_iteration": 2.436509847640991 }, { "auxiliary_loss_clip": 0.01143242, "auxiliary_loss_mlp": 0.0105781, "balance_loss_clip": 1.02933121, "balance_loss_mlp": 1.03629994, "epoch": 0.07605591462498121, "flos": 14501174273280.0, "grad_norm": 2.5946179933620526, "language_loss": 0.8096326, "learning_rate": 3.943342071025974e-06, "loss": 0.8316431, "num_input_tokens_seen": 26931485, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.0703125, "step": 1265, "time_per_iteration": 2.3973848819732666 }, { "auxiliary_loss_clip": 0.01144294, "auxiliary_loss_mlp": 0.01049988, "balance_loss_clip": 1.02011418, "balance_loss_mlp": 1.03644371, "epoch": 0.07611603787764919, "flos": 23513060211840.0, "grad_norm": 3.287075522816002, "language_loss": 0.65693021, "learning_rate": 3.9432527427778455e-06, "loss": 0.678873, "num_input_tokens_seen": 26951670, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.078125, "step": 1266, "time_per_iteration": 2.455113172531128 }, { "auxiliary_loss_clip": 0.0114309, "auxiliary_loss_mlp": 0.01054833, "balance_loss_clip": 1.02599609, "balance_loss_mlp": 1.03630745, "epoch": 0.07617616113031715, "flos": 21506112950400.0, "grad_norm": 2.377862188966611, "language_loss": 0.79258627, "learning_rate": 3.943163345180026e-06, "loss": 0.81456548, "num_input_tokens_seen": 26970335, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0625, "step": 1267, "time_per_iteration": 2.5339574813842773 }, { "auxiliary_loss_clip": 0.0114187, "auxiliary_loss_mlp": 0.01043139, "balance_loss_clip": 1.01756895, "balance_loss_mlp": 1.03486967, "epoch": 0.07623628438298512, "flos": 14572327357440.0, "grad_norm": 2.5288995682024065, "language_loss": 0.72980249, "learning_rate": 3.9430738782357054e-06, "loss": 0.75165266, "num_input_tokens_seen": 26986025, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.0703125, "step": 1268, "time_per_iteration": 2.394282579421997 }, { "auxiliary_loss_clip": 0.01143821, "auxiliary_loss_mlp": 0.01048508, "balance_loss_clip": 1.02023184, "balance_loss_mlp": 1.03516436, "epoch": 0.07629640763565308, "flos": 14719521116160.0, "grad_norm": 2.747619032322646, "language_loss": 0.82369566, "learning_rate": 3.9429843419480755e-06, "loss": 0.84561896, "num_input_tokens_seen": 27004045, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0859375, "step": 1269, "time_per_iteration": 2.429852247238159 }, { "auxiliary_loss_clip": 0.01141087, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.02332711, "balance_loss_mlp": 1.03607571, "epoch": 0.07635653088832106, "flos": 14902047037440.0, "grad_norm": 2.3775327902069257, "language_loss": 0.88504201, "learning_rate": 3.942894736320334e-06, "loss": 0.90695786, "num_input_tokens_seen": 27022070, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.046875, "step": 1270, "time_per_iteration": 2.413921356201172 }, { "auxiliary_loss_clip": 0.01145719, "auxiliary_loss_mlp": 0.01053373, "balance_loss_clip": 1.02552581, "balance_loss_mlp": 1.03685999, "epoch": 0.07641665414098903, "flos": 26650355518080.0, "grad_norm": 2.3193044847198054, "language_loss": 0.71426392, "learning_rate": 3.942805061355676e-06, "loss": 0.73625481, "num_input_tokens_seen": 27041755, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0859375, "step": 1271, "time_per_iteration": 2.488715410232544 }, { "auxiliary_loss_clip": 0.01140245, "auxiliary_loss_mlp": 0.01051859, "balance_loss_clip": 1.02447629, "balance_loss_mlp": 1.03839946, "epoch": 0.07647677739365699, "flos": 25191614424960.0, "grad_norm": 1.6024138693327201, "language_loss": 0.82551324, "learning_rate": 3.9427153170573026e-06, "loss": 0.8474344, "num_input_tokens_seen": 27061540, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.015625, "step": 1272, "time_per_iteration": 2.4781711101531982 }, { "auxiliary_loss_clip": 0.01141554, "auxiliary_loss_mlp": 0.01053515, "balance_loss_clip": 1.02572775, "balance_loss_mlp": 1.0339067, "epoch": 0.07653690064632497, "flos": 20557103840640.0, "grad_norm": 4.691640888023605, "language_loss": 0.7996034, "learning_rate": 3.9426255034284174e-06, "loss": 0.82155412, "num_input_tokens_seen": 27081395, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.078125, "step": 1273, "time_per_iteration": 2.424798011779785 }, { "auxiliary_loss_clip": 0.01141954, "auxiliary_loss_mlp": 0.01055519, "balance_loss_clip": 1.0280174, "balance_loss_mlp": 1.03463483, "epoch": 0.07659702389899294, "flos": 22268336952960.0, "grad_norm": 2.39038723505383, "language_loss": 0.81201237, "learning_rate": 3.942535620472224e-06, "loss": 0.83398712, "num_input_tokens_seen": 27101175, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.078125, "step": 1274, "time_per_iteration": 2.4857890605926514 }, { "auxiliary_loss_clip": 0.01144799, "auxiliary_loss_mlp": 0.01058775, "balance_loss_clip": 1.03149986, "balance_loss_mlp": 1.03649175, "epoch": 0.0766571471516609, "flos": 32634713064960.0, "grad_norm": 2.102679445046312, "language_loss": 0.73003268, "learning_rate": 3.942445668191932e-06, "loss": 0.7520684, "num_input_tokens_seen": 27124505, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.078125, "step": 1275, "time_per_iteration": 2.533172845840454 }, { "auxiliary_loss_clip": 0.01145421, "auxiliary_loss_mlp": 0.01054721, "balance_loss_clip": 1.0252409, "balance_loss_mlp": 1.03737283, "epoch": 0.07671727040432888, "flos": 15266505386880.0, "grad_norm": 2.224435838407383, "language_loss": 0.79420996, "learning_rate": 3.94235564659075e-06, "loss": 0.81621134, "num_input_tokens_seen": 27140960, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.078125, "step": 1276, "time_per_iteration": 2.540464162826538 }, { "auxiliary_loss_clip": 0.0114624, "auxiliary_loss_mlp": 0.01051885, "balance_loss_clip": 1.0239898, "balance_loss_mlp": 1.03762674, "epoch": 0.07677739365699685, "flos": 28182833136000.0, "grad_norm": 2.2290197674949424, "language_loss": 0.59222054, "learning_rate": 3.942265555671892e-06, "loss": 0.61420172, "num_input_tokens_seen": 27160985, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.0859375, "step": 1277, "time_per_iteration": 2.5006768703460693 }, { "auxiliary_loss_clip": 0.01145807, "auxiliary_loss_mlp": 0.01057558, "balance_loss_clip": 1.02946019, "balance_loss_mlp": 1.03459835, "epoch": 0.07683751690966481, "flos": 18295150942080.0, "grad_norm": 3.6354157239331477, "language_loss": 0.75029022, "learning_rate": 3.942175395438572e-06, "loss": 0.77232379, "num_input_tokens_seen": 27178390, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.109375, "step": 1278, "time_per_iteration": 3.87268328666687 }, { "auxiliary_loss_clip": 0.01136778, "auxiliary_loss_mlp": 0.01051673, "balance_loss_clip": 1.02626991, "balance_loss_mlp": 1.03386188, "epoch": 0.07689764016233278, "flos": 21980024012160.0, "grad_norm": 2.747090015732431, "language_loss": 0.88341421, "learning_rate": 3.942085165894009e-06, "loss": 0.90529871, "num_input_tokens_seen": 27197505, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.03125, "step": 1279, "time_per_iteration": 2.445218086242676 }, { "auxiliary_loss_clip": 0.01139971, "auxiliary_loss_mlp": 0.01048246, "balance_loss_clip": 1.020661, "balance_loss_mlp": 1.03736174, "epoch": 0.07695776341500075, "flos": 22234924915200.0, "grad_norm": 2.417511747291879, "language_loss": 0.82531738, "learning_rate": 3.9419948670414206e-06, "loss": 0.84719956, "num_input_tokens_seen": 27214260, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0234375, "step": 1280, "time_per_iteration": 2.472209930419922 }, { "auxiliary_loss_clip": 0.01140022, "auxiliary_loss_mlp": 0.01056315, "balance_loss_clip": 1.02799058, "balance_loss_mlp": 1.0359323, "epoch": 0.07701788666766872, "flos": 16142825312640.0, "grad_norm": 3.092359493923539, "language_loss": 0.75768244, "learning_rate": 3.941904498884032e-06, "loss": 0.7796458, "num_input_tokens_seen": 27232525, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0390625, "step": 1281, "time_per_iteration": 4.002227067947388 }, { "auxiliary_loss_clip": 0.01144663, "auxiliary_loss_mlp": 0.0105056, "balance_loss_clip": 1.02142501, "balance_loss_mlp": 1.03378582, "epoch": 0.07707800992033668, "flos": 19462053047040.0, "grad_norm": 3.3006494765725862, "language_loss": 0.74827677, "learning_rate": 3.941814061425067e-06, "loss": 0.77022898, "num_input_tokens_seen": 27249800, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.109375, "step": 1282, "time_per_iteration": 3.923555374145508 }, { "auxiliary_loss_clip": 0.01143046, "auxiliary_loss_mlp": 0.01054118, "balance_loss_clip": 1.02739167, "balance_loss_mlp": 1.03525567, "epoch": 0.07713813317300466, "flos": 18989259148800.0, "grad_norm": 2.6287145402917154, "language_loss": 0.83850062, "learning_rate": 3.941723554667752e-06, "loss": 0.86047232, "num_input_tokens_seen": 27268895, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.078125, "step": 1283, "time_per_iteration": 2.483353853225708 }, { "auxiliary_loss_clip": 0.01142422, "auxiliary_loss_mlp": 0.01056764, "balance_loss_clip": 1.02675915, "balance_loss_mlp": 1.03554845, "epoch": 0.07719825642567263, "flos": 18112974134400.0, "grad_norm": 4.00045806440098, "language_loss": 0.74790585, "learning_rate": 3.941632978615318e-06, "loss": 0.7698977, "num_input_tokens_seen": 27288180, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0703125, "step": 1284, "time_per_iteration": 2.4336962699890137 }, { "auxiliary_loss_clip": 0.01139409, "auxiliary_loss_mlp": 0.01058026, "balance_loss_clip": 1.03098965, "balance_loss_mlp": 1.03460002, "epoch": 0.0772583796783406, "flos": 42192780312960.0, "grad_norm": 1.9606313210845743, "language_loss": 0.76300985, "learning_rate": 3.941542333270999e-06, "loss": 0.78498423, "num_input_tokens_seen": 27311815, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.046875, "step": 1285, "time_per_iteration": 2.6642353534698486 }, { "auxiliary_loss_clip": 0.01146061, "auxiliary_loss_mlp": 0.01060958, "balance_loss_clip": 1.03332567, "balance_loss_mlp": 1.0383873, "epoch": 0.07731850293100857, "flos": 24752546766720.0, "grad_norm": 2.016961446663883, "language_loss": 0.83767694, "learning_rate": 3.9414516186380275e-06, "loss": 0.85974705, "num_input_tokens_seen": 27331890, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.078125, "step": 1286, "time_per_iteration": 2.4510388374328613 }, { "auxiliary_loss_clip": 0.01143516, "auxiliary_loss_mlp": 0.01049856, "balance_loss_clip": 1.02274823, "balance_loss_mlp": 1.03486085, "epoch": 0.07737862618367654, "flos": 17564942522880.0, "grad_norm": 2.2536591918310656, "language_loss": 0.770509, "learning_rate": 3.941360834719641e-06, "loss": 0.79244268, "num_input_tokens_seen": 27348320, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0859375, "step": 1287, "time_per_iteration": 2.44064998626709 }, { "auxiliary_loss_clip": 0.01138396, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.0231142, "balance_loss_mlp": 1.03492832, "epoch": 0.0774387494363445, "flos": 25626038872320.0, "grad_norm": 2.0322938983618326, "language_loss": 0.84395438, "learning_rate": 3.941269981519081e-06, "loss": 0.86583531, "num_input_tokens_seen": 27367670, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.03125, "step": 1288, "time_per_iteration": 2.4835731983184814 }, { "auxiliary_loss_clip": 0.01138414, "auxiliary_loss_mlp": 0.01050989, "balance_loss_clip": 1.02489436, "balance_loss_mlp": 1.03338683, "epoch": 0.07749887268901248, "flos": 12239046817920.0, "grad_norm": 2.157235414053665, "language_loss": 0.85084462, "learning_rate": 3.941179059039589e-06, "loss": 0.87273872, "num_input_tokens_seen": 27385485, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.046875, "step": 1289, "time_per_iteration": 2.520219564437866 }, { "auxiliary_loss_clip": 0.01140713, "auxiliary_loss_mlp": 0.01047597, "balance_loss_clip": 1.01990509, "balance_loss_mlp": 1.03482342, "epoch": 0.07755899594168045, "flos": 25080590701440.0, "grad_norm": 1.9354442156770693, "language_loss": 0.85018635, "learning_rate": 3.941088067284409e-06, "loss": 0.87206948, "num_input_tokens_seen": 27405110, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0625, "step": 1290, "time_per_iteration": 2.470550060272217 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01061076, "balance_loss_clip": 1.03229904, "balance_loss_mlp": 1.03412962, "epoch": 0.07761911919434841, "flos": 14245540231680.0, "grad_norm": 2.307083349569191, "language_loss": 0.90523207, "learning_rate": 3.9409970062567895e-06, "loss": 0.92720503, "num_input_tokens_seen": 27422855, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0234375, "step": 1291, "time_per_iteration": 2.440746784210205 }, { "auxiliary_loss_clip": 0.01041045, "auxiliary_loss_mlp": 0.01016423, "balance_loss_clip": 1.01241791, "balance_loss_mlp": 1.00740957, "epoch": 0.07767924244701638, "flos": 67233463597440.0, "grad_norm": 0.8793135900668437, "language_loss": 0.65063083, "learning_rate": 3.94090587595998e-06, "loss": 0.67120552, "num_input_tokens_seen": 27487190, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.3359375, "step": 1292, "time_per_iteration": 3.168759822845459 }, { "auxiliary_loss_clip": 0.0114126, "auxiliary_loss_mlp": 0.01046543, "balance_loss_clip": 1.01861262, "balance_loss_mlp": 1.03376663, "epoch": 0.07773936569968436, "flos": 28549316344320.0, "grad_norm": 2.045054047714249, "language_loss": 0.87551838, "learning_rate": 3.940814676397232e-06, "loss": 0.89739639, "num_input_tokens_seen": 27510465, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.0703125, "step": 1293, "time_per_iteration": 2.5071475505828857 }, { "auxiliary_loss_clip": 0.01144298, "auxiliary_loss_mlp": 0.01053189, "balance_loss_clip": 1.02447128, "balance_loss_mlp": 1.03891051, "epoch": 0.07779948895235232, "flos": 27489039131520.0, "grad_norm": 2.1716084836194733, "language_loss": 0.84672004, "learning_rate": 3.940723407571801e-06, "loss": 0.8686949, "num_input_tokens_seen": 27528645, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.0546875, "step": 1294, "time_per_iteration": 2.5061745643615723 }, { "auxiliary_loss_clip": 0.01141525, "auxiliary_loss_mlp": 0.01048977, "balance_loss_clip": 1.02082014, "balance_loss_mlp": 1.03697085, "epoch": 0.07785961220502029, "flos": 18222322112640.0, "grad_norm": 2.381994821509741, "language_loss": 0.79361206, "learning_rate": 3.9406320694869425e-06, "loss": 0.81551707, "num_input_tokens_seen": 27546165, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.046875, "step": 1295, "time_per_iteration": 2.407604694366455 }, { "auxiliary_loss_clip": 0.01140814, "auxiliary_loss_mlp": 0.01047431, "balance_loss_clip": 1.02001262, "balance_loss_mlp": 1.03543675, "epoch": 0.07791973545768827, "flos": 24607063664640.0, "grad_norm": 2.319744815127164, "language_loss": 0.87795794, "learning_rate": 3.940540662145918e-06, "loss": 0.89984035, "num_input_tokens_seen": 27566520, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0546875, "step": 1296, "time_per_iteration": 2.4614198207855225 }, { "auxiliary_loss_clip": 0.01141737, "auxiliary_loss_mlp": 0.01049657, "balance_loss_clip": 1.02112985, "balance_loss_mlp": 1.03465438, "epoch": 0.07797985871035623, "flos": 14281221507840.0, "grad_norm": 3.0816995918719856, "language_loss": 0.96446133, "learning_rate": 3.940449185551989e-06, "loss": 0.98637521, "num_input_tokens_seen": 27581960, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.0703125, "step": 1297, "time_per_iteration": 2.4358835220336914 }, { "auxiliary_loss_clip": 0.01140842, "auxiliary_loss_mlp": 0.01050633, "balance_loss_clip": 1.0230006, "balance_loss_mlp": 1.03406048, "epoch": 0.0780399819630242, "flos": 26609367715200.0, "grad_norm": 2.114068414625776, "language_loss": 0.7612232, "learning_rate": 3.94035763970842e-06, "loss": 0.78313792, "num_input_tokens_seen": 27601415, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0703125, "step": 1298, "time_per_iteration": 2.452118396759033 }, { "auxiliary_loss_clip": 0.0114226, "auxiliary_loss_mlp": 0.01051076, "balance_loss_clip": 1.02457619, "balance_loss_mlp": 1.03616405, "epoch": 0.07810010521569218, "flos": 21833458657920.0, "grad_norm": 1.7562543677043454, "language_loss": 0.80491579, "learning_rate": 3.940266024618478e-06, "loss": 0.82684916, "num_input_tokens_seen": 27621490, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0625, "step": 1299, "time_per_iteration": 2.448579788208008 }, { "auxiliary_loss_clip": 0.01138035, "auxiliary_loss_mlp": 0.01050805, "balance_loss_clip": 1.0229218, "balance_loss_mlp": 1.03488827, "epoch": 0.07816022846836014, "flos": 25080101942400.0, "grad_norm": 2.063325759243455, "language_loss": 0.85981327, "learning_rate": 3.940174340285432e-06, "loss": 0.88170165, "num_input_tokens_seen": 27640600, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.03125, "step": 1300, "time_per_iteration": 2.4218318462371826 }, { "auxiliary_loss_clip": 0.0114285, "auxiliary_loss_mlp": 0.01046773, "balance_loss_clip": 1.01909256, "balance_loss_mlp": 1.03690553, "epoch": 0.0782203517210281, "flos": 40915901825280.0, "grad_norm": 2.2657372489361336, "language_loss": 0.71694589, "learning_rate": 3.940082586712555e-06, "loss": 0.73884213, "num_input_tokens_seen": 27663070, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0625, "step": 1301, "time_per_iteration": 2.6418845653533936 }, { "auxiliary_loss_clip": 0.01147725, "auxiliary_loss_mlp": 0.01058281, "balance_loss_clip": 1.03042173, "balance_loss_mlp": 1.03863072, "epoch": 0.07828047497369607, "flos": 41170418703360.0, "grad_norm": 1.5101327011812837, "language_loss": 0.7031014, "learning_rate": 3.939990763903122e-06, "loss": 0.72516143, "num_input_tokens_seen": 27686425, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.0859375, "step": 1302, "time_per_iteration": 2.741868734359741 }, { "auxiliary_loss_clip": 0.01142976, "auxiliary_loss_mlp": 0.01052096, "balance_loss_clip": 1.02437949, "balance_loss_mlp": 1.03674889, "epoch": 0.07834059822636405, "flos": 23507160192000.0, "grad_norm": 1.988705315306982, "language_loss": 0.82179976, "learning_rate": 3.939898871860407e-06, "loss": 0.84375048, "num_input_tokens_seen": 27704900, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0625, "step": 1303, "time_per_iteration": 2.4571609497070312 }, { "auxiliary_loss_clip": 0.01142584, "auxiliary_loss_mlp": 0.01059815, "balance_loss_clip": 1.03139567, "balance_loss_mlp": 1.03558779, "epoch": 0.07840072147903202, "flos": 20192854959360.0, "grad_norm": 2.581953731053822, "language_loss": 0.74705011, "learning_rate": 3.939806910587693e-06, "loss": 0.76907408, "num_input_tokens_seen": 27724890, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.0703125, "step": 1304, "time_per_iteration": 2.433095932006836 }, { "auxiliary_loss_clip": 0.01146743, "auxiliary_loss_mlp": 0.01056951, "balance_loss_clip": 1.02690983, "balance_loss_mlp": 1.04023051, "epoch": 0.07846084473169998, "flos": 21359757064320.0, "grad_norm": 1.8012610446750759, "language_loss": 0.76330793, "learning_rate": 3.9397148800882595e-06, "loss": 0.78534484, "num_input_tokens_seen": 27743115, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0625, "step": 1305, "time_per_iteration": 2.4251163005828857 }, { "auxiliary_loss_clip": 0.01145606, "auxiliary_loss_mlp": 0.01058712, "balance_loss_clip": 1.02882612, "balance_loss_mlp": 1.03599501, "epoch": 0.07852096798436796, "flos": 25409786711040.0, "grad_norm": 1.709212760868719, "language_loss": 0.84957409, "learning_rate": 3.939622780365391e-06, "loss": 0.87161732, "num_input_tokens_seen": 27763570, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.09375, "step": 1306, "time_per_iteration": 2.4500534534454346 }, { "auxiliary_loss_clip": 0.01141706, "auxiliary_loss_mlp": 0.01042407, "balance_loss_clip": 1.01584721, "balance_loss_mlp": 1.03744817, "epoch": 0.07858109123703592, "flos": 24570335047680.0, "grad_norm": 2.6426056403295197, "language_loss": 0.9069171, "learning_rate": 3.939530611422375e-06, "loss": 0.92875826, "num_input_tokens_seen": 27780030, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0390625, "step": 1307, "time_per_iteration": 2.4576053619384766 }, { "auxiliary_loss_clip": 0.01142213, "auxiliary_loss_mlp": 0.01051842, "balance_loss_clip": 1.02225399, "balance_loss_mlp": 1.03556371, "epoch": 0.07864121448970389, "flos": 20697978643200.0, "grad_norm": 1.8719671173611063, "language_loss": 0.8353464, "learning_rate": 3.939438373262501e-06, "loss": 0.85728693, "num_input_tokens_seen": 27796225, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.0625, "step": 1308, "time_per_iteration": 2.3935189247131348 }, { "auxiliary_loss_clip": 0.01139905, "auxiliary_loss_mlp": 0.0104803, "balance_loss_clip": 1.02133918, "balance_loss_mlp": 1.03627121, "epoch": 0.07870133774237187, "flos": 22965412625280.0, "grad_norm": 1.4422409899536226, "language_loss": 0.77097666, "learning_rate": 3.93934606588906e-06, "loss": 0.79285604, "num_input_tokens_seen": 27815975, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.0390625, "step": 1309, "time_per_iteration": 2.4520504474639893 }, { "auxiliary_loss_clip": 0.01148205, "auxiliary_loss_mlp": 0.01060707, "balance_loss_clip": 1.03188252, "balance_loss_mlp": 1.03713357, "epoch": 0.07876146099503983, "flos": 18841855921920.0, "grad_norm": 2.109083590132941, "language_loss": 0.80204201, "learning_rate": 3.939253689305346e-06, "loss": 0.82413113, "num_input_tokens_seen": 27832255, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.109375, "step": 1310, "time_per_iteration": 2.374415397644043 }, { "auxiliary_loss_clip": 0.01138736, "auxiliary_loss_mlp": 0.01049991, "balance_loss_clip": 1.02394342, "balance_loss_mlp": 1.03848791, "epoch": 0.0788215842477078, "flos": 23804654820480.0, "grad_norm": 1.6889419547689029, "language_loss": 0.72608209, "learning_rate": 3.939161243514657e-06, "loss": 0.74796939, "num_input_tokens_seen": 27852180, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.0078125, "step": 1311, "time_per_iteration": 2.4493556022644043 }, { "auxiliary_loss_clip": 0.01143532, "auxiliary_loss_mlp": 0.01063003, "balance_loss_clip": 1.03531039, "balance_loss_mlp": 1.03952634, "epoch": 0.07888170750037576, "flos": 21578837045760.0, "grad_norm": 3.1876067717240892, "language_loss": 0.85806346, "learning_rate": 3.939068728520291e-06, "loss": 0.88012886, "num_input_tokens_seen": 27871435, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0390625, "step": 1312, "time_per_iteration": 2.4188222885131836 }, { "auxiliary_loss_clip": 0.01141501, "auxiliary_loss_mlp": 0.01058017, "balance_loss_clip": 1.03069377, "balance_loss_mlp": 1.03968775, "epoch": 0.07894183075304374, "flos": 19863833506560.0, "grad_norm": 2.420747931174189, "language_loss": 0.81749922, "learning_rate": 3.938976144325549e-06, "loss": 0.83949447, "num_input_tokens_seen": 27890625, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.015625, "step": 1313, "time_per_iteration": 2.436534881591797 }, { "auxiliary_loss_clip": 0.01149076, "auxiliary_loss_mlp": 0.01059925, "balance_loss_clip": 1.02850103, "balance_loss_mlp": 1.03694129, "epoch": 0.07900195400571171, "flos": 16142546021760.0, "grad_norm": 2.417584199048771, "language_loss": 0.72915339, "learning_rate": 3.9388834909337375e-06, "loss": 0.75124347, "num_input_tokens_seen": 27906530, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.1171875, "step": 1314, "time_per_iteration": 2.3856356143951416 }, { "auxiliary_loss_clip": 0.01141154, "auxiliary_loss_mlp": 0.01053615, "balance_loss_clip": 1.02693546, "balance_loss_mlp": 1.03433251, "epoch": 0.07906207725837967, "flos": 23729347284480.0, "grad_norm": 1.614323519360908, "language_loss": 0.79576534, "learning_rate": 3.938790768348161e-06, "loss": 0.81771302, "num_input_tokens_seen": 27926725, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.0703125, "step": 1315, "time_per_iteration": 2.4572107791900635 }, { "auxiliary_loss_clip": 0.01140939, "auxiliary_loss_mlp": 0.01057704, "balance_loss_clip": 1.02779472, "balance_loss_mlp": 1.03500342, "epoch": 0.07912220051104765, "flos": 24314770828800.0, "grad_norm": 1.9746038132442256, "language_loss": 0.73879462, "learning_rate": 3.938697976572129e-06, "loss": 0.76078105, "num_input_tokens_seen": 27947875, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.0625, "step": 1316, "time_per_iteration": 2.4434773921966553 }, { "auxiliary_loss_clip": 0.01147625, "auxiliary_loss_mlp": 0.01056223, "balance_loss_clip": 1.02733898, "balance_loss_mlp": 1.03697991, "epoch": 0.07918232376371562, "flos": 18879038386560.0, "grad_norm": 3.399225985719132, "language_loss": 0.65351379, "learning_rate": 3.938605115608954e-06, "loss": 0.67555225, "num_input_tokens_seen": 27965040, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.1015625, "step": 1317, "time_per_iteration": 2.584772825241089 }, { "auxiliary_loss_clip": 0.01148, "auxiliary_loss_mlp": 0.01062528, "balance_loss_clip": 1.03260612, "balance_loss_mlp": 1.03737366, "epoch": 0.07924244701638358, "flos": 27375187587840.0, "grad_norm": 2.590912805255077, "language_loss": 0.7312218, "learning_rate": 3.938512185461948e-06, "loss": 0.75332707, "num_input_tokens_seen": 27985330, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.109375, "step": 1318, "time_per_iteration": 3.980745315551758 }, { "auxiliary_loss_clip": 0.01143958, "auxiliary_loss_mlp": 0.0105347, "balance_loss_clip": 1.02559924, "balance_loss_mlp": 1.03717411, "epoch": 0.07930257026905156, "flos": 25119134709120.0, "grad_norm": 1.67985434599967, "language_loss": 0.90111381, "learning_rate": 3.938419186134429e-06, "loss": 0.92308807, "num_input_tokens_seen": 28007615, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0625, "step": 1319, "time_per_iteration": 2.4653007984161377 }, { "auxiliary_loss_clip": 0.01142388, "auxiliary_loss_mlp": 0.01056527, "balance_loss_clip": 1.02764225, "balance_loss_mlp": 1.03432322, "epoch": 0.07936269352171953, "flos": 21833423746560.0, "grad_norm": 1.8351091050344135, "language_loss": 0.79586965, "learning_rate": 3.9383261176297155e-06, "loss": 0.81785882, "num_input_tokens_seen": 28027765, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.078125, "step": 1320, "time_per_iteration": 3.920444965362549 }, { "auxiliary_loss_clip": 0.01143597, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.02728558, "balance_loss_mlp": 1.03809261, "epoch": 0.07942281677438749, "flos": 16939124668800.0, "grad_norm": 2.9078392525385057, "language_loss": 0.69522524, "learning_rate": 3.938232979951129e-06, "loss": 0.71722168, "num_input_tokens_seen": 28044225, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.0546875, "step": 1321, "time_per_iteration": 5.2817018032073975 }, { "auxiliary_loss_clip": 0.01140461, "auxiliary_loss_mlp": 0.01058528, "balance_loss_clip": 1.03015637, "balance_loss_mlp": 1.03602624, "epoch": 0.07948294002705546, "flos": 18986012392320.0, "grad_norm": 2.1799705885269205, "language_loss": 0.84114683, "learning_rate": 3.938139773101993e-06, "loss": 0.86313665, "num_input_tokens_seen": 28062915, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.046875, "step": 1322, "time_per_iteration": 2.416301965713501 }, { "auxiliary_loss_clip": 0.01139827, "auxiliary_loss_mlp": 0.01054368, "balance_loss_clip": 1.0266279, "balance_loss_mlp": 1.03306556, "epoch": 0.07954306327972344, "flos": 21652364102400.0, "grad_norm": 2.3530515704260577, "language_loss": 0.90426469, "learning_rate": 3.938046497085634e-06, "loss": 0.92620659, "num_input_tokens_seen": 28082175, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0703125, "step": 1323, "time_per_iteration": 2.456897735595703 }, { "auxiliary_loss_clip": 0.01137344, "auxiliary_loss_mlp": 0.01053257, "balance_loss_clip": 1.02537346, "balance_loss_mlp": 1.0350616, "epoch": 0.0796031865323914, "flos": 23219196364800.0, "grad_norm": 1.726783845318455, "language_loss": 0.82554126, "learning_rate": 3.937953151905381e-06, "loss": 0.84744722, "num_input_tokens_seen": 28102645, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0234375, "step": 1324, "time_per_iteration": 2.426100730895996 }, { "auxiliary_loss_clip": 0.01141442, "auxiliary_loss_mlp": 0.01052664, "balance_loss_clip": 1.02341056, "balance_loss_mlp": 1.03469324, "epoch": 0.07966330978505937, "flos": 23293421648640.0, "grad_norm": 4.397938593012299, "language_loss": 0.79089087, "learning_rate": 3.937859737564564e-06, "loss": 0.81283194, "num_input_tokens_seen": 28122805, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.0703125, "step": 1325, "time_per_iteration": 2.5013134479522705 }, { "auxiliary_loss_clip": 0.01143631, "auxiliary_loss_mlp": 0.01060608, "balance_loss_clip": 1.03113937, "balance_loss_mlp": 1.03763437, "epoch": 0.07972343303772735, "flos": 18362952535680.0, "grad_norm": 2.3257959431325914, "language_loss": 0.88397908, "learning_rate": 3.937766254066519e-06, "loss": 0.9060216, "num_input_tokens_seen": 28140530, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.0625, "step": 1326, "time_per_iteration": 2.4037437438964844 }, { "auxiliary_loss_clip": 0.01137361, "auxiliary_loss_mlp": 0.01047272, "balance_loss_clip": 1.01801825, "balance_loss_mlp": 1.03456819, "epoch": 0.07978355629039531, "flos": 21761432789760.0, "grad_norm": 2.0451963149136407, "language_loss": 0.83130109, "learning_rate": 3.937672701414581e-06, "loss": 0.85314745, "num_input_tokens_seen": 28159640, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.03125, "step": 1327, "time_per_iteration": 2.4422802925109863 }, { "auxiliary_loss_clip": 0.01141986, "auxiliary_loss_mlp": 0.01053909, "balance_loss_clip": 1.02343893, "balance_loss_mlp": 1.03526497, "epoch": 0.07984367954306328, "flos": 18550331136000.0, "grad_norm": 2.109895088853339, "language_loss": 0.78819835, "learning_rate": 3.937579079612087e-06, "loss": 0.81015736, "num_input_tokens_seen": 28177050, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.0625, "step": 1328, "time_per_iteration": 2.3991034030914307 }, { "auxiliary_loss_clip": 0.01143765, "auxiliary_loss_mlp": 0.01053721, "balance_loss_clip": 1.02444315, "balance_loss_mlp": 1.03603315, "epoch": 0.07990380279573125, "flos": 16903268835840.0, "grad_norm": 2.470183114375481, "language_loss": 0.7324903, "learning_rate": 3.9374853886623805e-06, "loss": 0.75446516, "num_input_tokens_seen": 28193245, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.078125, "step": 1329, "time_per_iteration": 2.444092035293579 }, { "auxiliary_loss_clip": 0.01138578, "auxiliary_loss_mlp": 0.0104502, "balance_loss_clip": 1.01756668, "balance_loss_mlp": 1.03349066, "epoch": 0.07996392604839922, "flos": 24097192035840.0, "grad_norm": 1.7260779122665668, "language_loss": 0.8116973, "learning_rate": 3.937391628568805e-06, "loss": 0.83353329, "num_input_tokens_seen": 28213570, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.046875, "step": 1330, "time_per_iteration": 2.4590070247650146 }, { "auxiliary_loss_clip": 0.01139686, "auxiliary_loss_mlp": 0.01050342, "balance_loss_clip": 1.02336502, "balance_loss_mlp": 1.03483081, "epoch": 0.08002404930106718, "flos": 14277974751360.0, "grad_norm": 5.953708099505188, "language_loss": 0.88954514, "learning_rate": 3.937297799334706e-06, "loss": 0.9114455, "num_input_tokens_seen": 28229980, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.046875, "step": 1331, "time_per_iteration": 2.418520212173462 }, { "auxiliary_loss_clip": 0.01143133, "auxiliary_loss_mlp": 0.01050024, "balance_loss_clip": 1.02038908, "balance_loss_mlp": 1.03411341, "epoch": 0.08008417255373516, "flos": 40404633742080.0, "grad_norm": 1.9768105749615845, "language_loss": 0.73450077, "learning_rate": 3.937203900963431e-06, "loss": 0.75643235, "num_input_tokens_seen": 28253840, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.09375, "step": 1332, "time_per_iteration": 2.569986343383789 }, { "auxiliary_loss_clip": 0.01139288, "auxiliary_loss_mlp": 0.01047808, "balance_loss_clip": 1.01986504, "balance_loss_mlp": 1.03435731, "epoch": 0.08014429580640313, "flos": 18477921242880.0, "grad_norm": 1.9158417669999368, "language_loss": 0.82308197, "learning_rate": 3.9371099334583315e-06, "loss": 0.84495294, "num_input_tokens_seen": 28271675, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.046875, "step": 1333, "time_per_iteration": 2.416245460510254 }, { "auxiliary_loss_clip": 0.0114136, "auxiliary_loss_mlp": 0.01049533, "balance_loss_clip": 1.0222342, "balance_loss_mlp": 1.03392458, "epoch": 0.0802044190590711, "flos": 22052398993920.0, "grad_norm": 2.22853006873138, "language_loss": 0.74815822, "learning_rate": 3.937015896822762e-06, "loss": 0.7700671, "num_input_tokens_seen": 28291850, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.078125, "step": 1334, "time_per_iteration": 2.415252685546875 }, { "auxiliary_loss_clip": 0.01139333, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.02207494, "balance_loss_mlp": 1.03632402, "epoch": 0.08026454231173906, "flos": 24570963452160.0, "grad_norm": 1.8146781566640344, "language_loss": 0.80229247, "learning_rate": 3.936921791060078e-06, "loss": 0.82418305, "num_input_tokens_seen": 28310780, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.03125, "step": 1335, "time_per_iteration": 2.476264476776123 }, { "auxiliary_loss_clip": 0.01042149, "auxiliary_loss_mlp": 0.01018921, "balance_loss_clip": 1.01470125, "balance_loss_mlp": 1.00973916, "epoch": 0.08032466556440704, "flos": 52579195545600.0, "grad_norm": 0.7418359836964853, "language_loss": 0.5600881, "learning_rate": 3.936827616173636e-06, "loss": 0.58069885, "num_input_tokens_seen": 28369985, "router_z_loss_clip": 0.04223633, "router_z_loss_mlp": 0.32421875, "step": 1336, "time_per_iteration": 3.1065118312835693 }, { "auxiliary_loss_clip": 0.01139751, "auxiliary_loss_mlp": 0.01056251, "balance_loss_clip": 1.02970338, "balance_loss_mlp": 1.0362134, "epoch": 0.080384788817075, "flos": 23841453260160.0, "grad_norm": 2.0979713849682615, "language_loss": 0.67448568, "learning_rate": 3.9367333721668006e-06, "loss": 0.6964457, "num_input_tokens_seen": 28388670, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.03125, "step": 1337, "time_per_iteration": 2.450666904449463 }, { "auxiliary_loss_clip": 0.01140381, "auxiliary_loss_mlp": 0.01054173, "balance_loss_clip": 1.02661204, "balance_loss_mlp": 1.03676331, "epoch": 0.08044491206974297, "flos": 25299565948800.0, "grad_norm": 2.2552804727475815, "language_loss": 0.86439645, "learning_rate": 3.936639059042932e-06, "loss": 0.88634193, "num_input_tokens_seen": 28411845, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0390625, "step": 1338, "time_per_iteration": 2.505598783493042 }, { "auxiliary_loss_clip": 0.01138751, "auxiliary_loss_mlp": 0.01067749, "balance_loss_clip": 1.03838754, "balance_loss_mlp": 1.03342259, "epoch": 0.08050503532241095, "flos": 22375625160960.0, "grad_norm": 4.359292809857966, "language_loss": 0.87319863, "learning_rate": 3.936544676805397e-06, "loss": 0.89526367, "num_input_tokens_seen": 28427875, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.0546875, "step": 1339, "time_per_iteration": 2.425258159637451 }, { "auxiliary_loss_clip": 0.01133709, "auxiliary_loss_mlp": 0.01047137, "balance_loss_clip": 1.02039886, "balance_loss_mlp": 1.03435993, "epoch": 0.08056515857507891, "flos": 18368433619200.0, "grad_norm": 2.1293160007814826, "language_loss": 0.89519572, "learning_rate": 3.936450225457564e-06, "loss": 0.91700423, "num_input_tokens_seen": 28446615, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.9921875, "step": 1340, "time_per_iteration": 2.40386700630188 }, { "auxiliary_loss_clip": 0.01036387, "auxiliary_loss_mlp": 0.01003174, "balance_loss_clip": 0.99928826, "balance_loss_mlp": 1.00538206, "epoch": 0.08062528182774688, "flos": 51345329719680.0, "grad_norm": 0.8708482508804375, "language_loss": 0.64813459, "learning_rate": 3.936355705002804e-06, "loss": 0.66853023, "num_input_tokens_seen": 28505290, "router_z_loss_clip": 0.03881836, "router_z_loss_mlp": 0.31054688, "step": 1341, "time_per_iteration": 3.0302042961120605 }, { "auxiliary_loss_clip": 0.01145959, "auxiliary_loss_mlp": 0.01054501, "balance_loss_clip": 1.02598643, "balance_loss_mlp": 1.03534555, "epoch": 0.08068540508041486, "flos": 17598843319680.0, "grad_norm": 2.07789975291421, "language_loss": 0.89729524, "learning_rate": 3.936261115444489e-06, "loss": 0.91929984, "num_input_tokens_seen": 28522735, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.109375, "step": 1342, "time_per_iteration": 2.4231953620910645 }, { "auxiliary_loss_clip": 0.01147974, "auxiliary_loss_mlp": 0.01057106, "balance_loss_clip": 1.02868652, "balance_loss_mlp": 1.03958821, "epoch": 0.08074552833308282, "flos": 10560422782080.0, "grad_norm": 2.5990374723313217, "language_loss": 0.76440805, "learning_rate": 3.936166456785997e-06, "loss": 0.78645885, "num_input_tokens_seen": 28539460, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.078125, "step": 1343, "time_per_iteration": 2.42657470703125 }, { "auxiliary_loss_clip": 0.0103502, "auxiliary_loss_mlp": 0.01009421, "balance_loss_clip": 1.00532043, "balance_loss_mlp": 1.00449371, "epoch": 0.08080565158575079, "flos": 60837026739840.0, "grad_norm": 0.8016447465790755, "language_loss": 0.57401437, "learning_rate": 3.936071729030702e-06, "loss": 0.59445882, "num_input_tokens_seen": 28599855, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.3046875, "step": 1344, "time_per_iteration": 3.0285229682922363 }, { "auxiliary_loss_clip": 0.01142824, "auxiliary_loss_mlp": 0.01056169, "balance_loss_clip": 1.02847719, "balance_loss_mlp": 1.03656745, "epoch": 0.08086577483841875, "flos": 18331390800000.0, "grad_norm": 3.3861312365131355, "language_loss": 0.86296439, "learning_rate": 3.935976932181989e-06, "loss": 0.88495433, "num_input_tokens_seen": 28617585, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.0625, "step": 1345, "time_per_iteration": 2.429169178009033 }, { "auxiliary_loss_clip": 0.0113923, "auxiliary_loss_mlp": 0.01056697, "balance_loss_clip": 1.03047085, "balance_loss_mlp": 1.03582883, "epoch": 0.08092589809108673, "flos": 21542527365120.0, "grad_norm": 1.8247206662094533, "language_loss": 0.87417907, "learning_rate": 3.935882066243239e-06, "loss": 0.89613831, "num_input_tokens_seen": 28636355, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.03125, "step": 1346, "time_per_iteration": 2.438117027282715 }, { "auxiliary_loss_clip": 0.01140205, "auxiliary_loss_mlp": 0.01051821, "balance_loss_clip": 1.02616668, "balance_loss_mlp": 1.03634501, "epoch": 0.0809860213437547, "flos": 22126903568640.0, "grad_norm": 1.9378568526291882, "language_loss": 0.92655408, "learning_rate": 3.935787131217838e-06, "loss": 0.94847435, "num_input_tokens_seen": 28656260, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.0390625, "step": 1347, "time_per_iteration": 2.4197306632995605 }, { "auxiliary_loss_clip": 0.01136978, "auxiliary_loss_mlp": 0.01047197, "balance_loss_clip": 1.01831245, "balance_loss_mlp": 1.03443682, "epoch": 0.08104614459642266, "flos": 21724424881920.0, "grad_norm": 2.006159913199672, "language_loss": 0.89071, "learning_rate": 3.9356921271091734e-06, "loss": 0.91255176, "num_input_tokens_seen": 28675865, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0234375, "step": 1348, "time_per_iteration": 2.4586288928985596 }, { "auxiliary_loss_clip": 0.01137366, "auxiliary_loss_mlp": 0.01047615, "balance_loss_clip": 1.02150774, "balance_loss_mlp": 1.0380547, "epoch": 0.08110626784909064, "flos": 23950731415680.0, "grad_norm": 1.9698497285486793, "language_loss": 0.76631665, "learning_rate": 3.935597053920635e-06, "loss": 0.7881664, "num_input_tokens_seen": 28696255, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9921875, "step": 1349, "time_per_iteration": 2.4439492225646973 }, { "auxiliary_loss_clip": 0.01140006, "auxiliary_loss_mlp": 0.0105312, "balance_loss_clip": 1.02532005, "balance_loss_mlp": 1.03573346, "epoch": 0.0811663911017586, "flos": 19024696045440.0, "grad_norm": 2.4233555394337256, "language_loss": 0.88450396, "learning_rate": 3.935501911655618e-06, "loss": 0.90643525, "num_input_tokens_seen": 28713905, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0390625, "step": 1350, "time_per_iteration": 2.412860631942749 }, { "auxiliary_loss_clip": 0.01135942, "auxiliary_loss_mlp": 0.01058571, "balance_loss_clip": 1.03073609, "balance_loss_mlp": 1.03386545, "epoch": 0.08122651435442657, "flos": 15340381557120.0, "grad_norm": 2.1531664901380094, "language_loss": 0.8194319, "learning_rate": 3.935406700317516e-06, "loss": 0.84137702, "num_input_tokens_seen": 28732075, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0234375, "step": 1351, "time_per_iteration": 2.4073026180267334 }, { "auxiliary_loss_clip": 0.011395, "auxiliary_loss_mlp": 0.01051922, "balance_loss_clip": 1.02209556, "balance_loss_mlp": 1.03398967, "epoch": 0.08128663760709455, "flos": 23220453173760.0, "grad_norm": 2.489233034819534, "language_loss": 0.75422478, "learning_rate": 3.935311419909728e-06, "loss": 0.77613902, "num_input_tokens_seen": 28751150, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.0546875, "step": 1352, "time_per_iteration": 2.5176920890808105 }, { "auxiliary_loss_clip": 0.01141523, "auxiliary_loss_mlp": 0.01056237, "balance_loss_clip": 1.02710199, "balance_loss_mlp": 1.03585172, "epoch": 0.08134676085976252, "flos": 22964539841280.0, "grad_norm": 1.82520977696158, "language_loss": 0.83126086, "learning_rate": 3.935216070435652e-06, "loss": 0.85323852, "num_input_tokens_seen": 28773360, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.0546875, "step": 1353, "time_per_iteration": 2.468277931213379 }, { "auxiliary_loss_clip": 0.01033342, "auxiliary_loss_mlp": 0.01007523, "balance_loss_clip": 1.00339806, "balance_loss_mlp": 1.00308514, "epoch": 0.08140688411243048, "flos": 64319369679360.0, "grad_norm": 0.8483378491422867, "language_loss": 0.59735012, "learning_rate": 3.935120651898694e-06, "loss": 0.61775875, "num_input_tokens_seen": 28833390, "router_z_loss_clip": 0.04125977, "router_z_loss_mlp": 0.30273438, "step": 1354, "time_per_iteration": 3.0853233337402344 }, { "auxiliary_loss_clip": 0.01136525, "auxiliary_loss_mlp": 0.01052364, "balance_loss_clip": 1.02514863, "balance_loss_mlp": 1.03431463, "epoch": 0.08146700736509845, "flos": 22490768424960.0, "grad_norm": 1.8495339968426354, "language_loss": 0.82956147, "learning_rate": 3.935025164302257e-06, "loss": 0.85145044, "num_input_tokens_seen": 28852430, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0234375, "step": 1355, "time_per_iteration": 2.464796781539917 }, { "auxiliary_loss_clip": 0.01139687, "auxiliary_loss_mlp": 0.01059226, "balance_loss_clip": 1.0298171, "balance_loss_mlp": 1.03361726, "epoch": 0.08152713061776642, "flos": 20446813255680.0, "grad_norm": 1.9276991028738168, "language_loss": 0.7095387, "learning_rate": 3.934929607649749e-06, "loss": 0.73152781, "num_input_tokens_seen": 28870685, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.0625, "step": 1356, "time_per_iteration": 2.4853670597076416 }, { "auxiliary_loss_clip": 0.01139779, "auxiliary_loss_mlp": 0.01051055, "balance_loss_clip": 1.02342236, "balance_loss_mlp": 1.03437948, "epoch": 0.08158725387043439, "flos": 23549090601600.0, "grad_norm": 1.8537448997099917, "language_loss": 0.70516974, "learning_rate": 3.934833981944582e-06, "loss": 0.72707808, "num_input_tokens_seen": 28889860, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.0546875, "step": 1357, "time_per_iteration": 2.4754977226257324 }, { "auxiliary_loss_clip": 0.0114063, "auxiliary_loss_mlp": 0.01053673, "balance_loss_clip": 1.02531266, "balance_loss_mlp": 1.03652501, "epoch": 0.08164737712310235, "flos": 22016263870080.0, "grad_norm": 2.1873732431405237, "language_loss": 0.84406656, "learning_rate": 3.934738287190166e-06, "loss": 0.86600959, "num_input_tokens_seen": 28905865, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0390625, "step": 1358, "time_per_iteration": 3.949376106262207 }, { "auxiliary_loss_clip": 0.01141411, "auxiliary_loss_mlp": 0.01053594, "balance_loss_clip": 1.02345777, "balance_loss_mlp": 1.03640282, "epoch": 0.08170750037577033, "flos": 23366704325760.0, "grad_norm": 2.0468282200358843, "language_loss": 1.0262934, "learning_rate": 3.934642523389917e-06, "loss": 1.04824352, "num_input_tokens_seen": 28925250, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0546875, "step": 1359, "time_per_iteration": 3.8985743522644043 }, { "auxiliary_loss_clip": 0.01136888, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.02070248, "balance_loss_mlp": 1.03373373, "epoch": 0.0817676236284383, "flos": 28396850970240.0, "grad_norm": 2.0307171832386377, "language_loss": 0.83083647, "learning_rate": 3.934546690547253e-06, "loss": 0.85270447, "num_input_tokens_seen": 28943445, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.03125, "step": 1360, "time_per_iteration": 2.471463680267334 }, { "auxiliary_loss_clip": 0.01140642, "auxiliary_loss_mlp": 0.01051129, "balance_loss_clip": 1.02199435, "balance_loss_mlp": 1.03499973, "epoch": 0.08182774688110626, "flos": 19207885282560.0, "grad_norm": 2.2149181348842157, "language_loss": 0.72330105, "learning_rate": 3.934450788665594e-06, "loss": 0.74521875, "num_input_tokens_seen": 28962695, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.0546875, "step": 1361, "time_per_iteration": 5.282749176025391 }, { "auxiliary_loss_clip": 0.01132788, "auxiliary_loss_mlp": 0.01056152, "balance_loss_clip": 1.02720809, "balance_loss_mlp": 1.03063273, "epoch": 0.08188787013377424, "flos": 22782991438080.0, "grad_norm": 2.91924500043117, "language_loss": 0.76753962, "learning_rate": 3.934354817748363e-06, "loss": 0.78942901, "num_input_tokens_seen": 28982120, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0234375, "step": 1362, "time_per_iteration": 2.4537808895111084 }, { "auxiliary_loss_clip": 0.01139277, "auxiliary_loss_mlp": 0.01047336, "balance_loss_clip": 1.01981115, "balance_loss_mlp": 1.03720856, "epoch": 0.08194799338644221, "flos": 16467273377280.0, "grad_norm": 2.5230938950134862, "language_loss": 0.7296077, "learning_rate": 3.934258777798984e-06, "loss": 0.75147378, "num_input_tokens_seen": 28998100, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.0234375, "step": 1363, "time_per_iteration": 2.3879504203796387 }, { "auxiliary_loss_clip": 0.01139257, "auxiliary_loss_mlp": 0.01046154, "balance_loss_clip": 1.01670909, "balance_loss_mlp": 1.03688741, "epoch": 0.08200811663911017, "flos": 23912536521600.0, "grad_norm": 2.0120773302425747, "language_loss": 0.77598512, "learning_rate": 3.934162668820884e-06, "loss": 0.79783922, "num_input_tokens_seen": 29017095, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.0234375, "step": 1364, "time_per_iteration": 2.460463762283325 }, { "auxiliary_loss_clip": 0.01139736, "auxiliary_loss_mlp": 0.01051396, "balance_loss_clip": 1.02369142, "balance_loss_mlp": 1.03548634, "epoch": 0.08206823989177814, "flos": 17895534986880.0, "grad_norm": 11.193197151022844, "language_loss": 0.81889302, "learning_rate": 3.934066490817495e-06, "loss": 0.84080428, "num_input_tokens_seen": 29037240, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.046875, "step": 1365, "time_per_iteration": 2.552370548248291 }, { "auxiliary_loss_clip": 0.01138806, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.01888299, "balance_loss_mlp": 1.03748155, "epoch": 0.08212836314444612, "flos": 22087172574720.0, "grad_norm": 2.109783001922283, "language_loss": 0.82045788, "learning_rate": 3.9339702437922465e-06, "loss": 0.84230983, "num_input_tokens_seen": 29056250, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.015625, "step": 1366, "time_per_iteration": 2.615504264831543 }, { "auxiliary_loss_clip": 0.01138705, "auxiliary_loss_mlp": 0.010455, "balance_loss_clip": 1.02046609, "balance_loss_mlp": 1.03385198, "epoch": 0.08218848639711408, "flos": 17596678815360.0, "grad_norm": 1.770323620790888, "language_loss": 0.81591201, "learning_rate": 3.933873927748575e-06, "loss": 0.83775401, "num_input_tokens_seen": 29073380, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.046875, "step": 1367, "time_per_iteration": 2.467405080795288 }, { "auxiliary_loss_clip": 0.01145546, "auxiliary_loss_mlp": 0.01059798, "balance_loss_clip": 1.03116357, "balance_loss_mlp": 1.03625274, "epoch": 0.08224860964978205, "flos": 17856886245120.0, "grad_norm": 2.036747963087789, "language_loss": 0.82997632, "learning_rate": 3.933777542689918e-06, "loss": 0.85202968, "num_input_tokens_seen": 29091330, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.09375, "step": 1368, "time_per_iteration": 2.5678632259368896 }, { "auxiliary_loss_clip": 0.01133686, "auxiliary_loss_mlp": 0.0104746, "balance_loss_clip": 1.02059031, "balance_loss_mlp": 1.03601241, "epoch": 0.08230873290245003, "flos": 25226388005760.0, "grad_norm": 1.7779988400973337, "language_loss": 0.81281292, "learning_rate": 3.933681088619715e-06, "loss": 0.83462441, "num_input_tokens_seen": 29110375, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.9765625, "step": 1369, "time_per_iteration": 2.535598039627075 }, { "auxiliary_loss_clip": 0.01136147, "auxiliary_loss_mlp": 0.01048409, "balance_loss_clip": 1.02279139, "balance_loss_mlp": 1.03621101, "epoch": 0.08236885615511799, "flos": 31758567696000.0, "grad_norm": 2.113249725053309, "language_loss": 0.74624491, "learning_rate": 3.933584565541407e-06, "loss": 0.76809049, "num_input_tokens_seen": 29129395, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.0, "step": 1370, "time_per_iteration": 2.606149196624756 }, { "auxiliary_loss_clip": 0.01141181, "auxiliary_loss_mlp": 0.01055651, "balance_loss_clip": 1.02685022, "balance_loss_mlp": 1.03535104, "epoch": 0.08242897940778596, "flos": 23184702074880.0, "grad_norm": 1.5566014351480937, "language_loss": 0.74512672, "learning_rate": 3.9334879734584405e-06, "loss": 0.76709503, "num_input_tokens_seen": 29148650, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0625, "step": 1371, "time_per_iteration": 2.5078554153442383 }, { "auxiliary_loss_clip": 0.01137903, "auxiliary_loss_mlp": 0.01052755, "balance_loss_clip": 1.02305984, "balance_loss_mlp": 1.03381038, "epoch": 0.08248910266045394, "flos": 34490172470400.0, "grad_norm": 2.0681939965691374, "language_loss": 0.71125972, "learning_rate": 3.933391312374262e-06, "loss": 0.73316634, "num_input_tokens_seen": 29170785, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.046875, "step": 1372, "time_per_iteration": 2.66995906829834 }, { "auxiliary_loss_clip": 0.01141069, "auxiliary_loss_mlp": 0.01055284, "balance_loss_clip": 1.02529144, "balance_loss_mlp": 1.03453314, "epoch": 0.0825492259131219, "flos": 13435590533760.0, "grad_norm": 3.663715416242882, "language_loss": 0.87991744, "learning_rate": 3.93329458229232e-06, "loss": 0.90188098, "num_input_tokens_seen": 29185210, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0625, "step": 1373, "time_per_iteration": 2.432610034942627 }, { "auxiliary_loss_clip": 0.01136937, "auxiliary_loss_mlp": 0.01053465, "balance_loss_clip": 1.02511716, "balance_loss_mlp": 1.03353488, "epoch": 0.08260934916578987, "flos": 25811252968320.0, "grad_norm": 1.8320037522503072, "language_loss": 0.82148111, "learning_rate": 3.933197783216068e-06, "loss": 0.8433851, "num_input_tokens_seen": 29205210, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.03125, "step": 1374, "time_per_iteration": 2.4978749752044678 }, { "auxiliary_loss_clip": 0.01037488, "auxiliary_loss_mlp": 0.01014386, "balance_loss_clip": 1.00916481, "balance_loss_mlp": 1.00536513, "epoch": 0.08266947241845783, "flos": 63456909563520.0, "grad_norm": 0.8173504943743701, "language_loss": 0.60599476, "learning_rate": 3.93310091514896e-06, "loss": 0.62651354, "num_input_tokens_seen": 29265350, "router_z_loss_clip": 0.05224609, "router_z_loss_mlp": 0.3203125, "step": 1375, "time_per_iteration": 3.042445182800293 }, { "auxiliary_loss_clip": 0.01035876, "auxiliary_loss_mlp": 0.01008424, "balance_loss_clip": 1.00348854, "balance_loss_mlp": 1.00414395, "epoch": 0.08272959567112581, "flos": 69990346062720.0, "grad_norm": 0.9075145685559574, "language_loss": 0.62212205, "learning_rate": 3.933003978094452e-06, "loss": 0.64256501, "num_input_tokens_seen": 29321475, "router_z_loss_clip": 0.04931641, "router_z_loss_mlp": 0.31640625, "step": 1376, "time_per_iteration": 3.0153872966766357 }, { "auxiliary_loss_clip": 0.01142638, "auxiliary_loss_mlp": 0.01054963, "balance_loss_clip": 1.02731788, "balance_loss_mlp": 1.03678632, "epoch": 0.08278971892379378, "flos": 20412144408960.0, "grad_norm": 1.6758916856217034, "language_loss": 0.82464159, "learning_rate": 3.9329069720560045e-06, "loss": 0.84661758, "num_input_tokens_seen": 29341405, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0625, "step": 1377, "time_per_iteration": 2.5013442039489746 }, { "auxiliary_loss_clip": 0.01137221, "auxiliary_loss_mlp": 0.01052512, "balance_loss_clip": 1.02425957, "balance_loss_mlp": 1.03591537, "epoch": 0.08284984217646174, "flos": 26249028906240.0, "grad_norm": 1.8833560528177287, "language_loss": 0.84713018, "learning_rate": 3.932809897037079e-06, "loss": 0.8690275, "num_input_tokens_seen": 29361955, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.015625, "step": 1378, "time_per_iteration": 2.4867005348205566 }, { "auxiliary_loss_clip": 0.01138837, "auxiliary_loss_mlp": 0.01053253, "balance_loss_clip": 1.02495241, "balance_loss_mlp": 1.03472888, "epoch": 0.08290996542912972, "flos": 27193569361920.0, "grad_norm": 2.1981360833435644, "language_loss": 0.87588495, "learning_rate": 3.932712753041141e-06, "loss": 0.89780581, "num_input_tokens_seen": 29382395, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0390625, "step": 1379, "time_per_iteration": 2.477638006210327 }, { "auxiliary_loss_clip": 0.01137617, "auxiliary_loss_mlp": 0.01054187, "balance_loss_clip": 1.02743649, "balance_loss_mlp": 1.03673005, "epoch": 0.08297008868179769, "flos": 38616661728000.0, "grad_norm": 2.1679296386762332, "language_loss": 0.7849893, "learning_rate": 3.932615540071656e-06, "loss": 0.80690736, "num_input_tokens_seen": 29404460, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.015625, "step": 1380, "time_per_iteration": 2.553020477294922 }, { "auxiliary_loss_clip": 0.01137195, "auxiliary_loss_mlp": 0.01059167, "balance_loss_clip": 1.03142667, "balance_loss_mlp": 1.03819525, "epoch": 0.08303021193446565, "flos": 19973705155200.0, "grad_norm": 2.502140765456767, "language_loss": 0.85779071, "learning_rate": 3.932518258132094e-06, "loss": 0.8797543, "num_input_tokens_seen": 29422675, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.98828125, "step": 1381, "time_per_iteration": 2.4220502376556396 }, { "auxiliary_loss_clip": 0.01146824, "auxiliary_loss_mlp": 0.01056244, "balance_loss_clip": 1.02714479, "balance_loss_mlp": 1.03916287, "epoch": 0.08309033518713363, "flos": 13661792432640.0, "grad_norm": 2.8855093131695493, "language_loss": 0.88018179, "learning_rate": 3.932420907225926e-06, "loss": 0.9022125, "num_input_tokens_seen": 29439840, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.078125, "step": 1382, "time_per_iteration": 2.3828964233398438 }, { "auxiliary_loss_clip": 0.01138958, "auxiliary_loss_mlp": 0.01054701, "balance_loss_clip": 1.02839184, "balance_loss_mlp": 1.03570044, "epoch": 0.0831504584398016, "flos": 17967560855040.0, "grad_norm": 2.632102141344648, "language_loss": 0.77463621, "learning_rate": 3.932323487356626e-06, "loss": 0.7965728, "num_input_tokens_seen": 29457360, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.03125, "step": 1383, "time_per_iteration": 2.403245210647583 }, { "auxiliary_loss_clip": 0.01142298, "auxiliary_loss_mlp": 0.01054448, "balance_loss_clip": 1.02700639, "balance_loss_mlp": 1.03616834, "epoch": 0.08321058169246956, "flos": 22600290960000.0, "grad_norm": 6.694357317480596, "language_loss": 0.82948864, "learning_rate": 3.932225998527672e-06, "loss": 0.85145605, "num_input_tokens_seen": 29477040, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.0625, "step": 1384, "time_per_iteration": 2.4460690021514893 }, { "auxiliary_loss_clip": 0.01147629, "auxiliary_loss_mlp": 0.01055719, "balance_loss_clip": 1.02738309, "balance_loss_mlp": 1.03878617, "epoch": 0.08327070494513754, "flos": 22849501311360.0, "grad_norm": 2.7198915303661058, "language_loss": 0.85049307, "learning_rate": 3.932128440742542e-06, "loss": 0.87252659, "num_input_tokens_seen": 29492010, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0859375, "step": 1385, "time_per_iteration": 2.4074478149414062 }, { "auxiliary_loss_clip": 0.01144683, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.02263844, "balance_loss_mlp": 1.03906059, "epoch": 0.0833308281978055, "flos": 22781909185920.0, "grad_norm": 1.7272321262773471, "language_loss": 0.68542445, "learning_rate": 3.932030814004719e-06, "loss": 0.70738602, "num_input_tokens_seen": 29511850, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0546875, "step": 1386, "time_per_iteration": 2.489827871322632 }, { "auxiliary_loss_clip": 0.01138937, "auxiliary_loss_mlp": 0.0105088, "balance_loss_clip": 1.02410579, "balance_loss_mlp": 1.03381312, "epoch": 0.08339095145047347, "flos": 20811585807360.0, "grad_norm": 1.6662190524934888, "language_loss": 0.81894517, "learning_rate": 3.9319331183176844e-06, "loss": 0.84084338, "num_input_tokens_seen": 29531415, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.046875, "step": 1387, "time_per_iteration": 2.442065715789795 }, { "auxiliary_loss_clip": 0.0114151, "auxiliary_loss_mlp": 0.01063807, "balance_loss_clip": 1.03293192, "balance_loss_mlp": 1.03462207, "epoch": 0.08345107470314143, "flos": 18514335657600.0, "grad_norm": 1.9618631684366505, "language_loss": 0.77150124, "learning_rate": 3.931835353684927e-06, "loss": 0.79355443, "num_input_tokens_seen": 29549525, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.0703125, "step": 1388, "time_per_iteration": 2.442744731903076 }, { "auxiliary_loss_clip": 0.01136028, "auxiliary_loss_mlp": 0.01059897, "balance_loss_clip": 1.03164482, "balance_loss_mlp": 1.03585863, "epoch": 0.08351119795580941, "flos": 18806558670720.0, "grad_norm": 1.9977385433797352, "language_loss": 0.78928244, "learning_rate": 3.931737520109935e-06, "loss": 0.81124169, "num_input_tokens_seen": 29568705, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0, "step": 1389, "time_per_iteration": 2.403454542160034 }, { "auxiliary_loss_clip": 0.01140771, "auxiliary_loss_mlp": 0.01045638, "balance_loss_clip": 1.01771963, "balance_loss_mlp": 1.03638148, "epoch": 0.08357132120847738, "flos": 18440843512320.0, "grad_norm": 2.5662322532793325, "language_loss": 0.87396991, "learning_rate": 3.931639617596201e-06, "loss": 0.89583397, "num_input_tokens_seen": 29585855, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.046875, "step": 1390, "time_per_iteration": 2.444549798965454 }, { "auxiliary_loss_clip": 0.01136063, "auxiliary_loss_mlp": 0.01063131, "balance_loss_clip": 1.03559399, "balance_loss_mlp": 1.03334212, "epoch": 0.08363144446114534, "flos": 25921124616960.0, "grad_norm": 2.3827728135287236, "language_loss": 0.86620837, "learning_rate": 3.931541646147217e-06, "loss": 0.88820034, "num_input_tokens_seen": 29607280, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.03125, "step": 1391, "time_per_iteration": 2.4772896766662598 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01066716, "balance_loss_clip": 1.03882098, "balance_loss_mlp": 1.03849733, "epoch": 0.08369156771381332, "flos": 18040319861760.0, "grad_norm": 2.5776007911349925, "language_loss": 0.87413985, "learning_rate": 3.93144360576648e-06, "loss": 0.89627767, "num_input_tokens_seen": 29624130, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.0859375, "step": 1392, "time_per_iteration": 2.43986439704895 }, { "auxiliary_loss_clip": 0.0113886, "auxiliary_loss_mlp": 0.01055026, "balance_loss_clip": 1.02673769, "balance_loss_mlp": 1.03480148, "epoch": 0.08375169096648129, "flos": 22673992573440.0, "grad_norm": 2.5201146235582197, "language_loss": 0.79845703, "learning_rate": 3.931345496457489e-06, "loss": 0.82039583, "num_input_tokens_seen": 29643210, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.0390625, "step": 1393, "time_per_iteration": 2.4255776405334473 }, { "auxiliary_loss_clip": 0.01137911, "auxiliary_loss_mlp": 0.01047337, "balance_loss_clip": 1.02095628, "balance_loss_mlp": 1.03682518, "epoch": 0.08381181421914925, "flos": 26102044615680.0, "grad_norm": 3.8426116391483442, "language_loss": 0.84546328, "learning_rate": 3.931247318223746e-06, "loss": 0.86731571, "num_input_tokens_seen": 29663920, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.0078125, "step": 1394, "time_per_iteration": 2.4559414386749268 }, { "auxiliary_loss_clip": 0.01141432, "auxiliary_loss_mlp": 0.0104905, "balance_loss_clip": 1.0208931, "balance_loss_mlp": 1.0367496, "epoch": 0.08387193747181723, "flos": 20628780595200.0, "grad_norm": 2.1271812036602222, "language_loss": 0.82844597, "learning_rate": 3.931149071068753e-06, "loss": 0.85035086, "num_input_tokens_seen": 29683825, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.046875, "step": 1395, "time_per_iteration": 2.403179407119751 }, { "auxiliary_loss_clip": 0.01141043, "auxiliary_loss_mlp": 0.01050477, "balance_loss_clip": 1.02043593, "balance_loss_mlp": 1.03645396, "epoch": 0.0839320607244852, "flos": 13442363337600.0, "grad_norm": 2.805372604291138, "language_loss": 0.82337093, "learning_rate": 3.931050754996018e-06, "loss": 0.84528613, "num_input_tokens_seen": 29698775, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.046875, "step": 1396, "time_per_iteration": 2.3986716270446777 }, { "auxiliary_loss_clip": 0.01139023, "auxiliary_loss_mlp": 0.01051594, "balance_loss_clip": 1.0220654, "balance_loss_mlp": 1.03866041, "epoch": 0.08399218397715316, "flos": 23476122126720.0, "grad_norm": 1.9699496155025322, "language_loss": 0.76609969, "learning_rate": 3.930952370009048e-06, "loss": 0.78800583, "num_input_tokens_seen": 29719430, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.0, "step": 1397, "time_per_iteration": 3.893310308456421 }, { "auxiliary_loss_clip": 0.01136244, "auxiliary_loss_mlp": 0.01046049, "balance_loss_clip": 1.01685452, "balance_loss_mlp": 1.03420091, "epoch": 0.08405230722982113, "flos": 25919553605760.0, "grad_norm": 2.245557912887348, "language_loss": 0.7817446, "learning_rate": 3.930853916111355e-06, "loss": 0.80356753, "num_input_tokens_seen": 29739685, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.0234375, "step": 1398, "time_per_iteration": 3.903414726257324 }, { "auxiliary_loss_clip": 0.01131837, "auxiliary_loss_mlp": 0.01047987, "balance_loss_clip": 1.0217495, "balance_loss_mlp": 1.03283024, "epoch": 0.0841124304824891, "flos": 17966478602880.0, "grad_norm": 2.612612922286341, "language_loss": 0.95172715, "learning_rate": 3.930755393306453e-06, "loss": 0.9735254, "num_input_tokens_seen": 29756165, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.98828125, "step": 1399, "time_per_iteration": 2.4221866130828857 }, { "auxiliary_loss_clip": 0.01139385, "auxiliary_loss_mlp": 0.01057325, "balance_loss_clip": 1.02733231, "balance_loss_mlp": 1.03419042, "epoch": 0.08417255373515707, "flos": 25628482667520.0, "grad_norm": 1.9415604543083347, "language_loss": 0.81517625, "learning_rate": 3.930656801597857e-06, "loss": 0.83714336, "num_input_tokens_seen": 29776425, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0546875, "step": 1400, "time_per_iteration": 5.18965220451355 }, { "auxiliary_loss_clip": 0.01136377, "auxiliary_loss_mlp": 0.01053098, "balance_loss_clip": 1.0243212, "balance_loss_mlp": 1.0334307, "epoch": 0.08423267698782504, "flos": 26248540147200.0, "grad_norm": 3.0763122953915043, "language_loss": 0.86442995, "learning_rate": 3.930558140989087e-06, "loss": 0.88632476, "num_input_tokens_seen": 29796440, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.03125, "step": 1401, "time_per_iteration": 2.4589450359344482 }, { "auxiliary_loss_clip": 0.01140493, "auxiliary_loss_mlp": 0.01053125, "balance_loss_clip": 1.02307224, "balance_loss_mlp": 1.03402793, "epoch": 0.08429280024049302, "flos": 20118699498240.0, "grad_norm": 2.3133765135270075, "language_loss": 0.87033337, "learning_rate": 3.930459411483662e-06, "loss": 0.89226949, "num_input_tokens_seen": 29814755, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.0625, "step": 1402, "time_per_iteration": 2.4530887603759766 }, { "auxiliary_loss_clip": 0.01134145, "auxiliary_loss_mlp": 0.01047832, "balance_loss_clip": 1.02120042, "balance_loss_mlp": 1.03132677, "epoch": 0.08435292349316098, "flos": 42922849086720.0, "grad_norm": 2.048879929905967, "language_loss": 0.8895582, "learning_rate": 3.930360613085106e-06, "loss": 0.91137803, "num_input_tokens_seen": 29834785, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.03125, "step": 1403, "time_per_iteration": 2.6017262935638428 }, { "auxiliary_loss_clip": 0.01139954, "auxiliary_loss_mlp": 0.01052537, "balance_loss_clip": 1.02278185, "balance_loss_mlp": 1.0341984, "epoch": 0.08441304674582895, "flos": 22856169381120.0, "grad_norm": 2.3078835344609447, "language_loss": 0.80272245, "learning_rate": 3.930261745796945e-06, "loss": 0.82464731, "num_input_tokens_seen": 29854695, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.0625, "step": 1404, "time_per_iteration": 2.4601595401763916 }, { "auxiliary_loss_clip": 0.01142983, "auxiliary_loss_mlp": 0.01063266, "balance_loss_clip": 1.03136539, "balance_loss_mlp": 1.03684366, "epoch": 0.08447316999849692, "flos": 18696512465280.0, "grad_norm": 1.9814480155180556, "language_loss": 0.83600795, "learning_rate": 3.930162809622709e-06, "loss": 0.85807049, "num_input_tokens_seen": 29872180, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.0625, "step": 1405, "time_per_iteration": 2.4039623737335205 }, { "auxiliary_loss_clip": 0.01137342, "auxiliary_loss_mlp": 0.0105122, "balance_loss_clip": 1.022228, "balance_loss_mlp": 1.0332588, "epoch": 0.08453329325116489, "flos": 25482790097280.0, "grad_norm": 1.6255358588896107, "language_loss": 0.80443799, "learning_rate": 3.930063804565927e-06, "loss": 0.82632363, "num_input_tokens_seen": 29893205, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0390625, "step": 1406, "time_per_iteration": 2.510904550552368 }, { "auxiliary_loss_clip": 0.01140187, "auxiliary_loss_mlp": 0.01056026, "balance_loss_clip": 1.0283339, "balance_loss_mlp": 1.03640819, "epoch": 0.08459341650383286, "flos": 20919083483520.0, "grad_norm": 1.957000793352056, "language_loss": 0.79425609, "learning_rate": 3.929964730630132e-06, "loss": 0.81621814, "num_input_tokens_seen": 29911970, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0390625, "step": 1407, "time_per_iteration": 2.423840045928955 }, { "auxiliary_loss_clip": 0.01135099, "auxiliary_loss_mlp": 0.01050036, "balance_loss_clip": 1.02234411, "balance_loss_mlp": 1.03416073, "epoch": 0.08465353975650082, "flos": 13042223712000.0, "grad_norm": 2.3275697224793697, "language_loss": 0.91585648, "learning_rate": 3.92986558781886e-06, "loss": 0.9377079, "num_input_tokens_seen": 29929925, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0078125, "step": 1408, "time_per_iteration": 2.4315226078033447 }, { "auxiliary_loss_clip": 0.01043673, "auxiliary_loss_mlp": 0.01023297, "balance_loss_clip": 1.01826644, "balance_loss_mlp": 1.01066768, "epoch": 0.0847136630091688, "flos": 60874174293120.0, "grad_norm": 0.8792341838331387, "language_loss": 0.61765254, "learning_rate": 3.92976637613565e-06, "loss": 0.63832223, "num_input_tokens_seen": 29985950, "router_z_loss_clip": 0.05029297, "router_z_loss_mlp": 0.33007812, "step": 1409, "time_per_iteration": 3.1208980083465576 }, { "auxiliary_loss_clip": 0.01131074, "auxiliary_loss_mlp": 0.01055388, "balance_loss_clip": 1.02793384, "balance_loss_mlp": 1.03469133, "epoch": 0.08477378626183676, "flos": 22045661101440.0, "grad_norm": 1.6652926113525195, "language_loss": 0.86648887, "learning_rate": 3.9296670955840415e-06, "loss": 0.88835347, "num_input_tokens_seen": 30004330, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.96484375, "step": 1410, "time_per_iteration": 2.4426138401031494 }, { "auxiliary_loss_clip": 0.01136838, "auxiliary_loss_mlp": 0.01047834, "balance_loss_clip": 1.01812696, "balance_loss_mlp": 1.03348505, "epoch": 0.08483390951450473, "flos": 16689146267520.0, "grad_norm": 2.071857028368419, "language_loss": 0.74074405, "learning_rate": 3.929567746167578e-06, "loss": 0.76259077, "num_input_tokens_seen": 30022555, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.03125, "step": 1411, "time_per_iteration": 2.4424655437469482 }, { "auxiliary_loss_clip": 0.01036848, "auxiliary_loss_mlp": 0.01007644, "balance_loss_clip": 1.00278056, "balance_loss_mlp": 1.00472724, "epoch": 0.08489403276717271, "flos": 51581341710720.0, "grad_norm": 0.9068689782583981, "language_loss": 0.56724936, "learning_rate": 3.929468327889805e-06, "loss": 0.58769429, "num_input_tokens_seen": 30077220, "router_z_loss_clip": 0.04858398, "router_z_loss_mlp": 0.3203125, "step": 1412, "time_per_iteration": 3.0075435638427734 }, { "auxiliary_loss_clip": 0.01133906, "auxiliary_loss_mlp": 0.01054027, "balance_loss_clip": 1.02596474, "balance_loss_mlp": 1.03316736, "epoch": 0.08495415601984067, "flos": 17091380574720.0, "grad_norm": 2.4888670092824627, "language_loss": 0.88898432, "learning_rate": 3.9293688407542715e-06, "loss": 0.91086364, "num_input_tokens_seen": 30094600, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0078125, "step": 1413, "time_per_iteration": 2.515660524368286 }, { "auxiliary_loss_clip": 0.01138069, "auxiliary_loss_mlp": 0.01049608, "balance_loss_clip": 1.02165365, "balance_loss_mlp": 1.03670883, "epoch": 0.08501427927250864, "flos": 23147310142080.0, "grad_norm": 1.928976151337458, "language_loss": 0.88079464, "learning_rate": 3.929269284764526e-06, "loss": 0.9026714, "num_input_tokens_seen": 30114475, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.015625, "step": 1414, "time_per_iteration": 2.4387221336364746 }, { "auxiliary_loss_clip": 0.01138837, "auxiliary_loss_mlp": 0.01055666, "balance_loss_clip": 1.02861762, "balance_loss_mlp": 1.03549552, "epoch": 0.08507440252517662, "flos": 19062437091840.0, "grad_norm": 1.8104022752795743, "language_loss": 0.77125359, "learning_rate": 3.929169659924123e-06, "loss": 0.79319859, "num_input_tokens_seen": 30133350, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.03125, "step": 1415, "time_per_iteration": 2.4585282802581787 }, { "auxiliary_loss_clip": 0.01136626, "auxiliary_loss_mlp": 0.01053604, "balance_loss_clip": 1.02770007, "balance_loss_mlp": 1.03540778, "epoch": 0.08513452577784458, "flos": 60180137775360.0, "grad_norm": 1.7518766502615744, "language_loss": 0.70400184, "learning_rate": 3.929069966236617e-06, "loss": 0.72590417, "num_input_tokens_seen": 30159005, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.015625, "step": 1416, "time_per_iteration": 2.7807602882385254 }, { "auxiliary_loss_clip": 0.01142408, "auxiliary_loss_mlp": 0.01058259, "balance_loss_clip": 1.02874279, "balance_loss_mlp": 1.03745627, "epoch": 0.08519464903051255, "flos": 27307246348800.0, "grad_norm": 2.068740206450198, "language_loss": 0.74673724, "learning_rate": 3.928970203705565e-06, "loss": 0.76874387, "num_input_tokens_seen": 30179450, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.046875, "step": 1417, "time_per_iteration": 2.503241539001465 }, { "auxiliary_loss_clip": 0.01135854, "auxiliary_loss_mlp": 0.01048919, "balance_loss_clip": 1.02104759, "balance_loss_mlp": 1.03411698, "epoch": 0.08525477228318051, "flos": 20265404497920.0, "grad_norm": 2.8020629614021364, "language_loss": 0.82518953, "learning_rate": 3.92887037233453e-06, "loss": 0.84703726, "num_input_tokens_seen": 30197235, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.015625, "step": 1418, "time_per_iteration": 2.4216010570526123 }, { "auxiliary_loss_clip": 0.01036904, "auxiliary_loss_mlp": 0.01013136, "balance_loss_clip": 1.00758064, "balance_loss_mlp": 1.00450683, "epoch": 0.08531489553584849, "flos": 67611923268480.0, "grad_norm": 0.892372631416078, "language_loss": 0.56662297, "learning_rate": 3.928770472127073e-06, "loss": 0.58712339, "num_input_tokens_seen": 30257410, "router_z_loss_clip": 0.05566406, "router_z_loss_mlp": 0.32421875, "step": 1419, "time_per_iteration": 3.0361785888671875 }, { "auxiliary_loss_clip": 0.01135059, "auxiliary_loss_mlp": 0.010583, "balance_loss_clip": 1.03101301, "balance_loss_mlp": 1.03346896, "epoch": 0.08537501878851646, "flos": 27525732837120.0, "grad_norm": 2.225891915285972, "language_loss": 0.69978249, "learning_rate": 3.928670503086758e-06, "loss": 0.72171611, "num_input_tokens_seen": 30277865, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.015625, "step": 1420, "time_per_iteration": 2.5303280353546143 }, { "auxiliary_loss_clip": 0.011341, "auxiliary_loss_mlp": 0.01041267, "balance_loss_clip": 1.01250148, "balance_loss_mlp": 1.0326556, "epoch": 0.08543514204118442, "flos": 22783131083520.0, "grad_norm": 1.5346284285593206, "language_loss": 0.88313144, "learning_rate": 3.9285704652171545e-06, "loss": 0.90488505, "num_input_tokens_seen": 30298545, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.015625, "step": 1421, "time_per_iteration": 2.4855990409851074 }, { "auxiliary_loss_clip": 0.01035216, "auxiliary_loss_mlp": 0.01009874, "balance_loss_clip": 1.00470078, "balance_loss_mlp": 1.00329804, "epoch": 0.0854952652938524, "flos": 60987362520960.0, "grad_norm": 0.8049145610308904, "language_loss": 0.63468266, "learning_rate": 3.9284703585218324e-06, "loss": 0.65513355, "num_input_tokens_seen": 30361725, "router_z_loss_clip": 0.05175781, "router_z_loss_mlp": 0.3203125, "step": 1422, "time_per_iteration": 3.057037115097046 }, { "auxiliary_loss_clip": 0.01132589, "auxiliary_loss_mlp": 0.01055404, "balance_loss_clip": 1.0286057, "balance_loss_mlp": 1.03614104, "epoch": 0.08555538854652037, "flos": 28036791452160.0, "grad_norm": 3.313427635387682, "language_loss": 0.83097607, "learning_rate": 3.928370183004363e-06, "loss": 0.85285604, "num_input_tokens_seen": 30382180, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.96484375, "step": 1423, "time_per_iteration": 2.525609016418457 }, { "auxiliary_loss_clip": 0.01138477, "auxiliary_loss_mlp": 0.01063673, "balance_loss_clip": 1.03676784, "balance_loss_mlp": 1.03663898, "epoch": 0.08561551179918833, "flos": 23508277355520.0, "grad_norm": 1.6560125375036239, "language_loss": 0.75101602, "learning_rate": 3.9282699386683236e-06, "loss": 0.77303749, "num_input_tokens_seen": 30402980, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.015625, "step": 1424, "time_per_iteration": 2.5153520107269287 }, { "auxiliary_loss_clip": 0.01137275, "auxiliary_loss_mlp": 0.01058406, "balance_loss_clip": 1.03109503, "balance_loss_mlp": 1.0375545, "epoch": 0.08567563505185631, "flos": 17926084293120.0, "grad_norm": 1.8755866914873893, "language_loss": 0.76020384, "learning_rate": 3.928169625517289e-06, "loss": 0.78216064, "num_input_tokens_seen": 30420800, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.99609375, "step": 1425, "time_per_iteration": 2.46828293800354 }, { "auxiliary_loss_clip": 0.01134093, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.02071571, "balance_loss_mlp": 1.03487504, "epoch": 0.08573575830452428, "flos": 19718490049920.0, "grad_norm": 2.9193753758221637, "language_loss": 0.93008298, "learning_rate": 3.9280692435548405e-06, "loss": 0.95189023, "num_input_tokens_seen": 30439620, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.9921875, "step": 1426, "time_per_iteration": 2.4237146377563477 }, { "auxiliary_loss_clip": 0.01141535, "auxiliary_loss_mlp": 0.01061609, "balance_loss_clip": 1.03209257, "balance_loss_mlp": 1.03872645, "epoch": 0.08579588155719224, "flos": 17930587858560.0, "grad_norm": 2.0509856314306787, "language_loss": 0.75465858, "learning_rate": 3.927968792784561e-06, "loss": 0.77669007, "num_input_tokens_seen": 30457300, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.03125, "step": 1427, "time_per_iteration": 2.5031518936157227 }, { "auxiliary_loss_clip": 0.0113477, "auxiliary_loss_mlp": 0.01049683, "balance_loss_clip": 1.02349269, "balance_loss_mlp": 1.03487051, "epoch": 0.08585600480986022, "flos": 16032429993600.0, "grad_norm": 2.3071386430294982, "language_loss": 0.82328194, "learning_rate": 3.927868273210033e-06, "loss": 0.84512639, "num_input_tokens_seen": 30471580, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.0, "step": 1428, "time_per_iteration": 2.5303401947021484 }, { "auxiliary_loss_clip": 0.01142845, "auxiliary_loss_mlp": 0.0106289, "balance_loss_clip": 1.03399324, "balance_loss_mlp": 1.03674901, "epoch": 0.08591612806252819, "flos": 28656185616000.0, "grad_norm": 2.24419618106378, "language_loss": 0.79911095, "learning_rate": 3.927767684834847e-06, "loss": 0.8211683, "num_input_tokens_seen": 30492720, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0625, "step": 1429, "time_per_iteration": 2.5428974628448486 }, { "auxiliary_loss_clip": 0.01141666, "auxiliary_loss_mlp": 0.01056747, "balance_loss_clip": 1.02868533, "balance_loss_mlp": 1.03754735, "epoch": 0.08597625131519615, "flos": 20958081338880.0, "grad_norm": 2.76991814960215, "language_loss": 0.88487703, "learning_rate": 3.9276670276625894e-06, "loss": 0.90686119, "num_input_tokens_seen": 30509535, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0390625, "step": 1430, "time_per_iteration": 2.4304347038269043 }, { "auxiliary_loss_clip": 0.01137127, "auxiliary_loss_mlp": 0.01052006, "balance_loss_clip": 1.02481472, "balance_loss_mlp": 1.03735805, "epoch": 0.08603637456786412, "flos": 23255296577280.0, "grad_norm": 1.6513236082284355, "language_loss": 0.81535912, "learning_rate": 3.927566301696856e-06, "loss": 0.83725047, "num_input_tokens_seen": 30529490, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0, "step": 1431, "time_per_iteration": 2.4605906009674072 }, { "auxiliary_loss_clip": 0.01136667, "auxiliary_loss_mlp": 0.01054383, "balance_loss_clip": 1.02739429, "balance_loss_mlp": 1.03375912, "epoch": 0.0860964978205321, "flos": 28692914232960.0, "grad_norm": 1.9114593628809293, "language_loss": 0.77429157, "learning_rate": 3.927465506941238e-06, "loss": 0.79620206, "num_input_tokens_seen": 30550205, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.03125, "step": 1432, "time_per_iteration": 2.466153860092163 }, { "auxiliary_loss_clip": 0.01136558, "auxiliary_loss_mlp": 0.01058598, "balance_loss_clip": 1.02967751, "balance_loss_mlp": 1.03431463, "epoch": 0.08615662107320006, "flos": 19317372906240.0, "grad_norm": 2.704543968863709, "language_loss": 0.72969025, "learning_rate": 3.927364643399335e-06, "loss": 0.75164181, "num_input_tokens_seen": 30568830, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0234375, "step": 1433, "time_per_iteration": 2.426859140396118 }, { "auxiliary_loss_clip": 0.0114178, "auxiliary_loss_mlp": 0.01059604, "balance_loss_clip": 1.02967048, "balance_loss_mlp": 1.03753674, "epoch": 0.08621674432586802, "flos": 15850776856320.0, "grad_norm": 2.5010258393633356, "language_loss": 0.85818481, "learning_rate": 3.927263711074745e-06, "loss": 0.8801986, "num_input_tokens_seen": 30585730, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.0390625, "step": 1434, "time_per_iteration": 2.3832969665527344 }, { "auxiliary_loss_clip": 0.0113731, "auxiliary_loss_mlp": 0.01055979, "balance_loss_clip": 1.02808404, "balance_loss_mlp": 1.03510058, "epoch": 0.086276867578536, "flos": 14099777838720.0, "grad_norm": 2.47929626062069, "language_loss": 0.78560674, "learning_rate": 3.927162709971072e-06, "loss": 0.8075397, "num_input_tokens_seen": 30603180, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.0234375, "step": 1435, "time_per_iteration": 2.4245572090148926 }, { "auxiliary_loss_clip": 0.01036822, "auxiliary_loss_mlp": 0.01042827, "balance_loss_clip": 1.03898871, "balance_loss_mlp": 1.00600958, "epoch": 0.08633699083120397, "flos": 70181250710400.0, "grad_norm": 0.923907410816164, "language_loss": 0.57990175, "learning_rate": 3.927061640091918e-06, "loss": 0.60069823, "num_input_tokens_seen": 30668895, "router_z_loss_clip": 0.03833008, "router_z_loss_mlp": 0.30859375, "step": 1436, "time_per_iteration": 4.62123966217041 }, { "auxiliary_loss_clip": 0.01136609, "auxiliary_loss_mlp": 0.01054357, "balance_loss_clip": 1.02512634, "balance_loss_mlp": 1.0350647, "epoch": 0.08639711408387193, "flos": 30297592275840.0, "grad_norm": 2.785802479640344, "language_loss": 0.68792832, "learning_rate": 3.926960501440891e-06, "loss": 0.70983791, "num_input_tokens_seen": 30688955, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.015625, "step": 1437, "time_per_iteration": 2.506444215774536 }, { "auxiliary_loss_clip": 0.01136806, "auxiliary_loss_mlp": 0.0104706, "balance_loss_clip": 1.01931965, "balance_loss_mlp": 1.0348177, "epoch": 0.08645723733653991, "flos": 20296791676800.0, "grad_norm": 2.169010760070846, "language_loss": 0.72614551, "learning_rate": 3.9268592940216014e-06, "loss": 0.74798417, "num_input_tokens_seen": 30706095, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.015625, "step": 1438, "time_per_iteration": 3.8132054805755615 }, { "auxiliary_loss_clip": 0.01132794, "auxiliary_loss_mlp": 0.0104806, "balance_loss_clip": 1.01983142, "balance_loss_mlp": 1.03471184, "epoch": 0.08651736058920788, "flos": 32889195031680.0, "grad_norm": 1.600592663775302, "language_loss": 0.64091539, "learning_rate": 3.9267580178376596e-06, "loss": 0.6627239, "num_input_tokens_seen": 30729025, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.98046875, "step": 1439, "time_per_iteration": 2.560499906539917 }, { "auxiliary_loss_clip": 0.01137702, "auxiliary_loss_mlp": 0.01049448, "balance_loss_clip": 1.02102852, "balance_loss_mlp": 1.03593493, "epoch": 0.08657748384187584, "flos": 22636286438400.0, "grad_norm": 2.5636767582097706, "language_loss": 0.87194371, "learning_rate": 3.92665667289268e-06, "loss": 0.89381528, "num_input_tokens_seen": 30746155, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.015625, "step": 1440, "time_per_iteration": 5.246348857879639 }, { "auxiliary_loss_clip": 0.01142193, "auxiliary_loss_mlp": 0.01058538, "balance_loss_clip": 1.02762711, "balance_loss_mlp": 1.03654242, "epoch": 0.08663760709454381, "flos": 23657286504960.0, "grad_norm": 3.2939192965722217, "language_loss": 0.8352201, "learning_rate": 3.92655525919028e-06, "loss": 0.85722744, "num_input_tokens_seen": 30761410, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.0546875, "step": 1441, "time_per_iteration": 2.458915948867798 }, { "auxiliary_loss_clip": 0.01033196, "auxiliary_loss_mlp": 0.01004597, "balance_loss_clip": 1.00054348, "balance_loss_mlp": 1.00218558, "epoch": 0.08669773034721179, "flos": 62683688482560.0, "grad_norm": 0.8420296727267951, "language_loss": 0.60429287, "learning_rate": 3.926453776734078e-06, "loss": 0.62467074, "num_input_tokens_seen": 30823010, "router_z_loss_clip": 0.04052734, "router_z_loss_mlp": 0.31054688, "step": 1442, "time_per_iteration": 3.161848783493042 }, { "auxiliary_loss_clip": 0.01139796, "auxiliary_loss_mlp": 0.01052902, "balance_loss_clip": 1.02550721, "balance_loss_mlp": 1.03469789, "epoch": 0.08675785359987975, "flos": 20666451818880.0, "grad_norm": 2.652059476450735, "language_loss": 0.78552687, "learning_rate": 3.9263522255276965e-06, "loss": 0.80745387, "num_input_tokens_seen": 30841980, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0546875, "step": 1443, "time_per_iteration": 2.423205852508545 }, { "auxiliary_loss_clip": 0.01135757, "auxiliary_loss_mlp": 0.01049801, "balance_loss_clip": 1.02301478, "balance_loss_mlp": 1.03341126, "epoch": 0.08681797685254772, "flos": 26939960179200.0, "grad_norm": 1.604042055689223, "language_loss": 0.82368612, "learning_rate": 3.9262506055747596e-06, "loss": 0.84554166, "num_input_tokens_seen": 30863280, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.0234375, "step": 1444, "time_per_iteration": 2.4947876930236816 }, { "auxiliary_loss_clip": 0.01139058, "auxiliary_loss_mlp": 0.01054381, "balance_loss_clip": 1.02591348, "balance_loss_mlp": 1.03583121, "epoch": 0.0868781001052157, "flos": 17711856990720.0, "grad_norm": 2.8698712932591914, "language_loss": 0.87018931, "learning_rate": 3.926148916878893e-06, "loss": 0.8921237, "num_input_tokens_seen": 30881710, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.03125, "step": 1445, "time_per_iteration": 2.438837766647339 }, { "auxiliary_loss_clip": 0.01140454, "auxiliary_loss_mlp": 0.01055732, "balance_loss_clip": 1.02794433, "balance_loss_mlp": 1.0385077, "epoch": 0.08693822335788366, "flos": 19895639621760.0, "grad_norm": 1.8482243248422658, "language_loss": 0.81103694, "learning_rate": 3.926047159443727e-06, "loss": 0.83299881, "num_input_tokens_seen": 30900225, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0234375, "step": 1446, "time_per_iteration": 2.46681547164917 }, { "auxiliary_loss_clip": 0.01033918, "auxiliary_loss_mlp": 0.01006089, "balance_loss_clip": 1.00213134, "balance_loss_mlp": 1.00254512, "epoch": 0.08699834661055163, "flos": 67020878995200.0, "grad_norm": 0.7253279505632818, "language_loss": 0.54759985, "learning_rate": 3.925945333272891e-06, "loss": 0.56799996, "num_input_tokens_seen": 30959580, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.31445312, "step": 1447, "time_per_iteration": 3.1271657943725586 }, { "auxiliary_loss_clip": 0.01134434, "auxiliary_loss_mlp": 0.0105079, "balance_loss_clip": 1.02214408, "balance_loss_mlp": 1.03559732, "epoch": 0.0870584698632196, "flos": 13479650536320.0, "grad_norm": 2.368430375578426, "language_loss": 0.84644473, "learning_rate": 3.925843438370021e-06, "loss": 0.86829698, "num_input_tokens_seen": 30976775, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.984375, "step": 1448, "time_per_iteration": 2.5329737663269043 }, { "auxiliary_loss_clip": 0.01140375, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.02382088, "balance_loss_mlp": 1.03608632, "epoch": 0.08711859311588757, "flos": 16106096695680.0, "grad_norm": 2.6427047070415206, "language_loss": 0.80531889, "learning_rate": 3.925741474738752e-06, "loss": 0.82724291, "num_input_tokens_seen": 30990495, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.046875, "step": 1449, "time_per_iteration": 2.3928134441375732 }, { "auxiliary_loss_clip": 0.01135613, "auxiliary_loss_mlp": 0.01043545, "balance_loss_clip": 1.01903582, "balance_loss_mlp": 1.03557396, "epoch": 0.08717871636855554, "flos": 38470829512320.0, "grad_norm": 1.5675969670229246, "language_loss": 0.71181607, "learning_rate": 3.925639442382724e-06, "loss": 0.73360765, "num_input_tokens_seen": 31014080, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.0, "step": 1450, "time_per_iteration": 2.5841941833496094 }, { "auxiliary_loss_clip": 0.01136972, "auxiliary_loss_mlp": 0.01054337, "balance_loss_clip": 1.0267638, "balance_loss_mlp": 1.03637421, "epoch": 0.0872388396212235, "flos": 17599681192320.0, "grad_norm": 1.771054780384107, "language_loss": 0.83204961, "learning_rate": 3.925537341305578e-06, "loss": 0.85396278, "num_input_tokens_seen": 31031210, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.0078125, "step": 1451, "time_per_iteration": 2.3957407474517822 }, { "auxiliary_loss_clip": 0.01133863, "auxiliary_loss_mlp": 0.0105886, "balance_loss_clip": 1.03336096, "balance_loss_mlp": 1.03573465, "epoch": 0.08729896287389148, "flos": 25258368677760.0, "grad_norm": 2.1923603807858347, "language_loss": 0.74339652, "learning_rate": 3.925435171510957e-06, "loss": 0.76532376, "num_input_tokens_seen": 31049710, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.984375, "step": 1452, "time_per_iteration": 2.4761288166046143 }, { "auxiliary_loss_clip": 0.0113897, "auxiliary_loss_mlp": 0.0105538, "balance_loss_clip": 1.02777123, "balance_loss_mlp": 1.03575897, "epoch": 0.08735908612655945, "flos": 15631557229440.0, "grad_norm": 3.009200128085401, "language_loss": 0.79649633, "learning_rate": 3.925332933002507e-06, "loss": 0.81843984, "num_input_tokens_seen": 31066160, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.03125, "step": 1453, "time_per_iteration": 2.4006617069244385 }, { "auxiliary_loss_clip": 0.01135753, "auxiliary_loss_mlp": 0.01051944, "balance_loss_clip": 1.02537227, "balance_loss_mlp": 1.03667951, "epoch": 0.08741920937922741, "flos": 20338617352320.0, "grad_norm": 1.875711402079848, "language_loss": 0.70716834, "learning_rate": 3.925230625783877e-06, "loss": 0.72904533, "num_input_tokens_seen": 31085270, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.9921875, "step": 1454, "time_per_iteration": 2.452676773071289 }, { "auxiliary_loss_clip": 0.01034214, "auxiliary_loss_mlp": 0.0101123, "balance_loss_clip": 1.00710583, "balance_loss_mlp": 1.00364447, "epoch": 0.08747933263189539, "flos": 62816252515200.0, "grad_norm": 0.7824902349415341, "language_loss": 0.58511788, "learning_rate": 3.925128249858719e-06, "loss": 0.60557228, "num_input_tokens_seen": 31148445, "router_z_loss_clip": 0.04125977, "router_z_loss_mlp": 0.3046875, "step": 1455, "time_per_iteration": 3.0468578338623047 }, { "auxiliary_loss_clip": 0.01134979, "auxiliary_loss_mlp": 0.01048629, "balance_loss_clip": 1.02142525, "balance_loss_mlp": 1.03400826, "epoch": 0.08753945588456336, "flos": 33034503576960.0, "grad_norm": 1.5610199804777385, "language_loss": 0.77557188, "learning_rate": 3.925025805230685e-06, "loss": 0.79740798, "num_input_tokens_seen": 31168770, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0078125, "step": 1456, "time_per_iteration": 2.5630991458892822 }, { "auxiliary_loss_clip": 0.01132645, "auxiliary_loss_mlp": 0.01054823, "balance_loss_clip": 1.02566481, "balance_loss_mlp": 1.03343987, "epoch": 0.08759957913723132, "flos": 35545911206400.0, "grad_norm": 2.3625478373839406, "language_loss": 0.71963835, "learning_rate": 3.924923291903433e-06, "loss": 0.74151307, "num_input_tokens_seen": 31189270, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 0.9921875, "step": 1457, "time_per_iteration": 2.551595449447632 }, { "auxiliary_loss_clip": 0.01130536, "auxiliary_loss_mlp": 0.01044576, "balance_loss_clip": 1.01885056, "balance_loss_mlp": 1.03258061, "epoch": 0.0876597023898993, "flos": 23910092726400.0, "grad_norm": 1.5815599312414572, "language_loss": 0.86436832, "learning_rate": 3.924820709880619e-06, "loss": 0.88611948, "num_input_tokens_seen": 31210385, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.98046875, "step": 1458, "time_per_iteration": 2.4821531772613525 }, { "auxiliary_loss_clip": 0.01140857, "auxiliary_loss_mlp": 0.0104819, "balance_loss_clip": 1.02134418, "balance_loss_mlp": 1.03808141, "epoch": 0.08771982564256726, "flos": 18113043957120.0, "grad_norm": 1.6349072283959376, "language_loss": 0.8053205, "learning_rate": 3.924718059165906e-06, "loss": 0.82721102, "num_input_tokens_seen": 31229745, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.03125, "step": 1459, "time_per_iteration": 2.4148335456848145 }, { "auxiliary_loss_clip": 0.01137273, "auxiliary_loss_mlp": 0.01055503, "balance_loss_clip": 1.02746463, "balance_loss_mlp": 1.03450203, "epoch": 0.08777994889523523, "flos": 17711054029440.0, "grad_norm": 2.099146642925664, "language_loss": 0.84267873, "learning_rate": 3.924615339762956e-06, "loss": 0.8646065, "num_input_tokens_seen": 31248280, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0234375, "step": 1460, "time_per_iteration": 2.470891237258911 }, { "auxiliary_loss_clip": 0.01130985, "auxiliary_loss_mlp": 0.01051176, "balance_loss_clip": 1.02502179, "balance_loss_mlp": 1.03299022, "epoch": 0.0878400721479032, "flos": 12819198746880.0, "grad_norm": 2.6593803727230347, "language_loss": 0.81124723, "learning_rate": 3.924512551675435e-06, "loss": 0.83306885, "num_input_tokens_seen": 31262190, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.98046875, "step": 1461, "time_per_iteration": 2.421342372894287 }, { "auxiliary_loss_clip": 0.01138365, "auxiliary_loss_mlp": 0.01052623, "balance_loss_clip": 1.02715981, "balance_loss_mlp": 1.03707671, "epoch": 0.08790019540057117, "flos": 26391579454080.0, "grad_norm": 1.7269677394716834, "language_loss": 0.76201111, "learning_rate": 3.924409694907011e-06, "loss": 0.783921, "num_input_tokens_seen": 31283690, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.015625, "step": 1462, "time_per_iteration": 2.6441681385040283 }, { "auxiliary_loss_clip": 0.01139179, "auxiliary_loss_mlp": 0.01055463, "balance_loss_clip": 1.02630436, "balance_loss_mlp": 1.03623247, "epoch": 0.08796031865323914, "flos": 19133066505600.0, "grad_norm": 1.7974264920681688, "language_loss": 0.74233687, "learning_rate": 3.924306769461356e-06, "loss": 0.7642833, "num_input_tokens_seen": 31302505, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.03125, "step": 1463, "time_per_iteration": 2.507075309753418 }, { "auxiliary_loss_clip": 0.01137699, "auxiliary_loss_mlp": 0.01051707, "balance_loss_clip": 1.02271533, "balance_loss_mlp": 1.03376389, "epoch": 0.0880204419059071, "flos": 26063186405760.0, "grad_norm": 1.8813185484463697, "language_loss": 0.83247638, "learning_rate": 3.924203775342142e-06, "loss": 0.85437036, "num_input_tokens_seen": 31323070, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0390625, "step": 1464, "time_per_iteration": 2.582576274871826 }, { "auxiliary_loss_clip": 0.01135477, "auxiliary_loss_mlp": 0.01055543, "balance_loss_clip": 1.02949548, "balance_loss_mlp": 1.03474152, "epoch": 0.08808056515857508, "flos": 22376881969920.0, "grad_norm": 1.893011339821771, "language_loss": 0.78369987, "learning_rate": 3.924100712553046e-06, "loss": 0.80561006, "num_input_tokens_seen": 31341880, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.0078125, "step": 1465, "time_per_iteration": 2.4433510303497314 }, { "auxiliary_loss_clip": 0.01138777, "auxiliary_loss_mlp": 0.01050684, "balance_loss_clip": 1.02324176, "balance_loss_mlp": 1.03619003, "epoch": 0.08814068841124305, "flos": 23184178404480.0, "grad_norm": 2.4569350502347165, "language_loss": 0.84995323, "learning_rate": 3.923997581097744e-06, "loss": 0.87184787, "num_input_tokens_seen": 31361995, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0234375, "step": 1466, "time_per_iteration": 2.514909267425537 }, { "auxiliary_loss_clip": 0.01137144, "auxiliary_loss_mlp": 0.01049688, "balance_loss_clip": 1.02200794, "balance_loss_mlp": 1.03487492, "epoch": 0.08820081166391101, "flos": 25154117758080.0, "grad_norm": 2.1428941119918825, "language_loss": 0.84030366, "learning_rate": 3.923894380979917e-06, "loss": 0.86217201, "num_input_tokens_seen": 31381515, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.015625, "step": 1467, "time_per_iteration": 2.4532175064086914 }, { "auxiliary_loss_clip": 0.01136589, "auxiliary_loss_mlp": 0.01050094, "balance_loss_clip": 1.02235413, "balance_loss_mlp": 1.03442502, "epoch": 0.08826093491657899, "flos": 22230735552000.0, "grad_norm": 1.8526771256998313, "language_loss": 0.75296938, "learning_rate": 3.9237911122032485e-06, "loss": 0.7748363, "num_input_tokens_seen": 31400345, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0234375, "step": 1468, "time_per_iteration": 2.4733340740203857 }, { "auxiliary_loss_clip": 0.01134615, "auxiliary_loss_mlp": 0.01045144, "balance_loss_clip": 1.0202769, "balance_loss_mlp": 1.03503835, "epoch": 0.08832105816924696, "flos": 22125751493760.0, "grad_norm": 5.546335350124982, "language_loss": 0.8053264, "learning_rate": 3.923687774771424e-06, "loss": 0.827124, "num_input_tokens_seen": 31419620, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.99609375, "step": 1469, "time_per_iteration": 2.4505934715270996 }, { "auxiliary_loss_clip": 0.01139914, "auxiliary_loss_mlp": 0.01053373, "balance_loss_clip": 1.02658677, "balance_loss_mlp": 1.03651309, "epoch": 0.08838118142191492, "flos": 17565536016000.0, "grad_norm": 1.989153980440257, "language_loss": 0.77890998, "learning_rate": 3.923584368688132e-06, "loss": 0.80084276, "num_input_tokens_seen": 31437970, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.03125, "step": 1470, "time_per_iteration": 2.4670047760009766 }, { "auxiliary_loss_clip": 0.01132672, "auxiliary_loss_mlp": 0.01050743, "balance_loss_clip": 1.0239327, "balance_loss_mlp": 1.03389835, "epoch": 0.0884413046745829, "flos": 20776148910720.0, "grad_norm": 1.8836256182417797, "language_loss": 0.83851361, "learning_rate": 3.923480893957061e-06, "loss": 0.86034775, "num_input_tokens_seen": 31457040, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.98828125, "step": 1471, "time_per_iteration": 2.4457244873046875 }, { "auxiliary_loss_clip": 0.01130809, "auxiliary_loss_mlp": 0.01046425, "balance_loss_clip": 1.02320361, "balance_loss_mlp": 1.0354538, "epoch": 0.08850142792725087, "flos": 22124424862080.0, "grad_norm": 5.684584918680441, "language_loss": 0.83179504, "learning_rate": 3.923377350581905e-06, "loss": 0.85356736, "num_input_tokens_seen": 31477520, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.953125, "step": 1472, "time_per_iteration": 2.4930155277252197 }, { "auxiliary_loss_clip": 0.01135368, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.01808798, "balance_loss_mlp": 1.03654361, "epoch": 0.08856155117991883, "flos": 22417660304640.0, "grad_norm": 2.271622402276832, "language_loss": 0.82474113, "learning_rate": 3.923273738566359e-06, "loss": 0.84653151, "num_input_tokens_seen": 31495575, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.98828125, "step": 1473, "time_per_iteration": 2.4435055255889893 }, { "auxiliary_loss_clip": 0.01136878, "auxiliary_loss_mlp": 0.01047921, "balance_loss_clip": 1.02318478, "balance_loss_mlp": 1.03624725, "epoch": 0.0886216744325868, "flos": 29935647544320.0, "grad_norm": 1.5623027892790873, "language_loss": 0.78689879, "learning_rate": 3.92317005791412e-06, "loss": 0.80874676, "num_input_tokens_seen": 31520020, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.0078125, "step": 1474, "time_per_iteration": 2.5359132289886475 }, { "auxiliary_loss_clip": 0.01132754, "auxiliary_loss_mlp": 0.01048824, "balance_loss_clip": 1.02194262, "balance_loss_mlp": 1.03589225, "epoch": 0.08868179768525478, "flos": 23981839303680.0, "grad_norm": 1.6698744687384766, "language_loss": 0.79016858, "learning_rate": 3.923066308628889e-06, "loss": 0.81198436, "num_input_tokens_seen": 31539265, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.96875, "step": 1475, "time_per_iteration": 2.441384792327881 }, { "auxiliary_loss_clip": 0.01133227, "auxiliary_loss_mlp": 0.01045926, "balance_loss_clip": 1.02074862, "balance_loss_mlp": 1.03375602, "epoch": 0.08874192093792274, "flos": 43175934599040.0, "grad_norm": 1.5978715824027918, "language_loss": 0.73998678, "learning_rate": 3.922962490714368e-06, "loss": 0.76177835, "num_input_tokens_seen": 31563425, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.99609375, "step": 1476, "time_per_iteration": 4.0761120319366455 }, { "auxiliary_loss_clip": 0.01136907, "auxiliary_loss_mlp": 0.01049839, "balance_loss_clip": 1.02361321, "balance_loss_mlp": 1.03561664, "epoch": 0.0888020441905907, "flos": 32851104871680.0, "grad_norm": 1.8367264592435533, "language_loss": 0.74373507, "learning_rate": 3.922858604174262e-06, "loss": 0.76560253, "num_input_tokens_seen": 31584525, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.015625, "step": 1477, "time_per_iteration": 2.597036123275757 }, { "auxiliary_loss_clip": 0.0113435, "auxiliary_loss_mlp": 0.01055158, "balance_loss_clip": 1.02894378, "balance_loss_mlp": 1.03508937, "epoch": 0.08886216744325869, "flos": 23148217837440.0, "grad_norm": 1.8871903181689216, "language_loss": 0.86721641, "learning_rate": 3.922754649012279e-06, "loss": 0.88911152, "num_input_tokens_seen": 31603325, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9921875, "step": 1478, "time_per_iteration": 3.840590476989746 }, { "auxiliary_loss_clip": 0.0113549, "auxiliary_loss_mlp": 0.01053253, "balance_loss_clip": 1.02746797, "balance_loss_mlp": 1.03530025, "epoch": 0.08892229069592665, "flos": 23330464467840.0, "grad_norm": 3.261643036609131, "language_loss": 0.77389818, "learning_rate": 3.922650625232128e-06, "loss": 0.79578561, "num_input_tokens_seen": 31624820, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.0, "step": 1479, "time_per_iteration": 4.07110071182251 }, { "auxiliary_loss_clip": 0.01130919, "auxiliary_loss_mlp": 0.01043634, "balance_loss_clip": 1.01843274, "balance_loss_mlp": 1.03348565, "epoch": 0.08898241394859462, "flos": 26212579580160.0, "grad_norm": 2.419935582481106, "language_loss": 0.78363329, "learning_rate": 3.922546532837522e-06, "loss": 0.80537885, "num_input_tokens_seen": 31646080, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.9765625, "step": 1480, "time_per_iteration": 4.018751382827759 }, { "auxiliary_loss_clip": 0.01133847, "auxiliary_loss_mlp": 0.01052865, "balance_loss_clip": 1.02467179, "balance_loss_mlp": 1.03337443, "epoch": 0.0890425372012626, "flos": 23549474626560.0, "grad_norm": 2.081024641177727, "language_loss": 0.66308194, "learning_rate": 3.9224423718321756e-06, "loss": 0.68494904, "num_input_tokens_seen": 31665770, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0078125, "step": 1481, "time_per_iteration": 2.4800703525543213 }, { "auxiliary_loss_clip": 0.01134249, "auxiliary_loss_mlp": 0.01047694, "balance_loss_clip": 1.02288699, "balance_loss_mlp": 1.03551579, "epoch": 0.08910266045393056, "flos": 23001687394560.0, "grad_norm": 1.864432448296234, "language_loss": 0.9653616, "learning_rate": 3.922338142219806e-06, "loss": 0.98718101, "num_input_tokens_seen": 31683805, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.98828125, "step": 1482, "time_per_iteration": -0.18274378776550293 }, { "auxiliary_loss_clip": 0.01137095, "auxiliary_loss_mlp": 0.01052594, "balance_loss_clip": 1.02611768, "balance_loss_mlp": 1.03577983, "epoch": 0.08916278370659853, "flos": 31935298331520.0, "grad_norm": 1.9234200730524673, "language_loss": 0.7877143, "learning_rate": 3.922233844004133e-06, "loss": 0.8096112, "num_input_tokens_seen": 31704630, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.015625, "step": 1483, "time_per_iteration": 2.5001602172851562 }, { "auxiliary_loss_clip": 0.01132379, "auxiliary_loss_mlp": 0.01056567, "balance_loss_clip": 1.03047252, "balance_loss_mlp": 1.03430367, "epoch": 0.08922290695926649, "flos": 17529435803520.0, "grad_norm": 2.3630293380418683, "language_loss": 0.85483754, "learning_rate": 3.922129477188879e-06, "loss": 0.87672698, "num_input_tokens_seen": 31723255, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.98046875, "step": 1484, "time_per_iteration": 2.4240922927856445 }, { "auxiliary_loss_clip": 0.01142654, "auxiliary_loss_mlp": 0.01049875, "balance_loss_clip": 1.02124047, "balance_loss_mlp": 1.03831923, "epoch": 0.08928303021193447, "flos": 32123689361280.0, "grad_norm": 1.6040066629630427, "language_loss": 0.80224192, "learning_rate": 3.922025041777768e-06, "loss": 0.82416725, "num_input_tokens_seen": 31747045, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.0390625, "step": 1485, "time_per_iteration": 2.524641275405884 }, { "auxiliary_loss_clip": 0.01133156, "auxiliary_loss_mlp": 0.01051077, "balance_loss_clip": 1.02557778, "balance_loss_mlp": 1.03244472, "epoch": 0.08934315346460243, "flos": 22124180482560.0, "grad_norm": 2.0853469502904693, "language_loss": 0.82999718, "learning_rate": 3.921920537774528e-06, "loss": 0.85183954, "num_input_tokens_seen": 31766615, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.0078125, "step": 1486, "time_per_iteration": 2.4714694023132324 }, { "auxiliary_loss_clip": 0.01132548, "auxiliary_loss_mlp": 0.01056082, "balance_loss_clip": 1.02856851, "balance_loss_mlp": 1.03458941, "epoch": 0.0894032767172704, "flos": 22564470038400.0, "grad_norm": 1.6977251970071152, "language_loss": 0.76376575, "learning_rate": 3.921815965182887e-06, "loss": 0.78565204, "num_input_tokens_seen": 31785855, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.9765625, "step": 1487, "time_per_iteration": 2.472322940826416 }, { "auxiliary_loss_clip": 0.01135312, "auxiliary_loss_mlp": 0.01050471, "balance_loss_clip": 1.02255261, "balance_loss_mlp": 1.03422713, "epoch": 0.08946339996993838, "flos": 20192366200320.0, "grad_norm": 2.0317647278322477, "language_loss": 0.82573104, "learning_rate": 3.921711324006578e-06, "loss": 0.8475889, "num_input_tokens_seen": 31804210, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.0078125, "step": 1488, "time_per_iteration": 2.4572408199310303 }, { "auxiliary_loss_clip": 0.01132356, "auxiliary_loss_mlp": 0.01048558, "balance_loss_clip": 1.02412033, "balance_loss_mlp": 1.0337038, "epoch": 0.08952352322260634, "flos": 48358372060800.0, "grad_norm": 2.8944739791810865, "language_loss": 0.72003675, "learning_rate": 3.921606614249335e-06, "loss": 0.74184585, "num_input_tokens_seen": 31826150, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.98828125, "step": 1489, "time_per_iteration": 2.759321451187134 }, { "auxiliary_loss_clip": 0.01130591, "auxiliary_loss_mlp": 0.01049125, "balance_loss_clip": 1.02280331, "balance_loss_mlp": 1.03231263, "epoch": 0.08958364647527431, "flos": 31791805176960.0, "grad_norm": 1.7753833348466836, "language_loss": 0.89858687, "learning_rate": 3.921501835914894e-06, "loss": 0.92038399, "num_input_tokens_seen": 31848060, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.984375, "step": 1490, "time_per_iteration": 2.541839122772217 }, { "auxiliary_loss_clip": 0.01140402, "auxiliary_loss_mlp": 0.01056209, "balance_loss_clip": 1.02856421, "balance_loss_mlp": 1.03494847, "epoch": 0.08964376972794229, "flos": 23367053439360.0, "grad_norm": 2.3138583614972386, "language_loss": 0.73459613, "learning_rate": 3.921396989006997e-06, "loss": 0.75656223, "num_input_tokens_seen": 31870040, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0546875, "step": 1491, "time_per_iteration": 2.482553482055664 }, { "auxiliary_loss_clip": 0.01132926, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.01953959, "balance_loss_mlp": 1.03456628, "epoch": 0.08970389298061025, "flos": 23293666028160.0, "grad_norm": 1.9212727717432074, "language_loss": 0.76900983, "learning_rate": 3.9212920735293824e-06, "loss": 0.79078269, "num_input_tokens_seen": 31890400, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.984375, "step": 1492, "time_per_iteration": 2.4586904048919678 }, { "auxiliary_loss_clip": 0.01132489, "auxiliary_loss_mlp": 0.01048568, "balance_loss_clip": 1.02209187, "balance_loss_mlp": 1.03553343, "epoch": 0.08976401623327822, "flos": 33760417898880.0, "grad_norm": 2.0921066681155245, "language_loss": 0.70533705, "learning_rate": 3.921187089485796e-06, "loss": 0.72714764, "num_input_tokens_seen": 31913435, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.96875, "step": 1493, "time_per_iteration": 2.540811777114868 }, { "auxiliary_loss_clip": 0.01132349, "auxiliary_loss_mlp": 0.01048068, "balance_loss_clip": 1.02154422, "balance_loss_mlp": 1.03320432, "epoch": 0.08982413948594618, "flos": 23910302194560.0, "grad_norm": 1.8346011499961192, "language_loss": 0.86851084, "learning_rate": 3.921082036879985e-06, "loss": 0.89031506, "num_input_tokens_seen": 31932435, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.9921875, "step": 1494, "time_per_iteration": 2.462740421295166 }, { "auxiliary_loss_clip": 0.01133166, "auxiliary_loss_mlp": 0.01054281, "balance_loss_clip": 1.02751827, "balance_loss_mlp": 1.03466368, "epoch": 0.08988426273861416, "flos": 16836584405760.0, "grad_norm": 1.7800069294718281, "language_loss": 0.83029783, "learning_rate": 3.9209769157156976e-06, "loss": 0.85217232, "num_input_tokens_seen": 31950125, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.984375, "step": 1495, "time_per_iteration": 2.403963804244995 }, { "auxiliary_loss_clip": 0.0113773, "auxiliary_loss_mlp": 0.01058836, "balance_loss_clip": 1.03222859, "balance_loss_mlp": 1.03652573, "epoch": 0.08994438599128213, "flos": 14792489591040.0, "grad_norm": 1.8861377359210703, "language_loss": 0.69612455, "learning_rate": 3.920871725996685e-06, "loss": 0.7180903, "num_input_tokens_seen": 31968050, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.015625, "step": 1496, "time_per_iteration": 2.3890879154205322 }, { "auxiliary_loss_clip": 0.01132691, "auxiliary_loss_mlp": 0.01049185, "balance_loss_clip": 1.024616, "balance_loss_mlp": 1.0346992, "epoch": 0.09000450924395009, "flos": 17383359208320.0, "grad_norm": 1.665119356571217, "language_loss": 0.79898089, "learning_rate": 3.920766467726702e-06, "loss": 0.82079965, "num_input_tokens_seen": 31985675, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.9765625, "step": 1497, "time_per_iteration": 2.4362621307373047 }, { "auxiliary_loss_clip": 0.01135802, "auxiliary_loss_mlp": 0.01049072, "balance_loss_clip": 1.02345371, "balance_loss_mlp": 1.03321958, "epoch": 0.09006463249661807, "flos": 24279159375360.0, "grad_norm": 2.704740676644494, "language_loss": 0.8292343, "learning_rate": 3.920661140909505e-06, "loss": 0.85108304, "num_input_tokens_seen": 32005180, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.0234375, "step": 1498, "time_per_iteration": 2.4597747325897217 }, { "auxiliary_loss_clip": 0.01136483, "auxiliary_loss_mlp": 0.01056237, "balance_loss_clip": 1.0302614, "balance_loss_mlp": 1.03528881, "epoch": 0.09012475574928604, "flos": 13661094205440.0, "grad_norm": 4.45428008519945, "language_loss": 0.78773302, "learning_rate": 3.920555745548851e-06, "loss": 0.8096602, "num_input_tokens_seen": 32022970, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.015625, "step": 1499, "time_per_iteration": 2.419222593307495 }, { "auxiliary_loss_clip": 0.01130284, "auxiliary_loss_mlp": 0.01055989, "balance_loss_clip": 1.03046608, "balance_loss_mlp": 1.03425586, "epoch": 0.090184879001954, "flos": 23326728952320.0, "grad_norm": 1.7092410090117585, "language_loss": 0.93098229, "learning_rate": 3.920450281648503e-06, "loss": 0.95284498, "num_input_tokens_seen": 32043055, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9609375, "step": 1500, "time_per_iteration": 2.435533285140991 }, { "auxiliary_loss_clip": 0.01133888, "auxiliary_loss_mlp": 0.01047049, "balance_loss_clip": 1.02153802, "balance_loss_mlp": 1.03357148, "epoch": 0.09024500225462198, "flos": 23001582660480.0, "grad_norm": 2.196175652597993, "language_loss": 0.74589396, "learning_rate": 3.920344749212226e-06, "loss": 0.76770335, "num_input_tokens_seen": 32061900, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.0, "step": 1501, "time_per_iteration": 2.46028733253479 }, { "auxiliary_loss_clip": 0.01044632, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.02193058, "balance_loss_mlp": 1.00976682, "epoch": 0.09030512550728995, "flos": 62185966007040.0, "grad_norm": 0.7316882177862591, "language_loss": 0.58222729, "learning_rate": 3.920239148243783e-06, "loss": 0.60293198, "num_input_tokens_seen": 32122745, "router_z_loss_clip": 0.0390625, "router_z_loss_mlp": 0.34765625, "step": 1502, "time_per_iteration": 3.0684406757354736 }, { "auxiliary_loss_clip": 0.01127273, "auxiliary_loss_mlp": 0.01045086, "balance_loss_clip": 1.02217412, "balance_loss_mlp": 1.03078341, "epoch": 0.09036524875995791, "flos": 38799152737920.0, "grad_norm": 2.273966761436493, "language_loss": 0.69753504, "learning_rate": 3.920133478746944e-06, "loss": 0.71925861, "num_input_tokens_seen": 32145125, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.96484375, "step": 1503, "time_per_iteration": 2.6171352863311768 }, { "auxiliary_loss_clip": 0.01133858, "auxiliary_loss_mlp": 0.01047987, "balance_loss_clip": 1.02242839, "balance_loss_mlp": 1.03417444, "epoch": 0.09042537201262588, "flos": 21688987985280.0, "grad_norm": 2.231932536970297, "language_loss": 0.85978246, "learning_rate": 3.920027740725481e-06, "loss": 0.88160092, "num_input_tokens_seen": 32166255, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.0, "step": 1504, "time_per_iteration": 2.4569787979125977 }, { "auxiliary_loss_clip": 0.01140245, "auxiliary_loss_mlp": 0.01054718, "balance_loss_clip": 1.0257622, "balance_loss_mlp": 1.03655267, "epoch": 0.09048549526529386, "flos": 22266102625920.0, "grad_norm": 2.087997650270069, "language_loss": 0.72479331, "learning_rate": 3.919921934183167e-06, "loss": 0.7467429, "num_input_tokens_seen": 32184010, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0390625, "step": 1505, "time_per_iteration": 2.4890501499176025 }, { "auxiliary_loss_clip": 0.01132126, "auxiliary_loss_mlp": 0.01046802, "balance_loss_clip": 1.02039695, "balance_loss_mlp": 1.03433979, "epoch": 0.09054561851796182, "flos": 14610068403840.0, "grad_norm": 2.0171736463787093, "language_loss": 0.80757898, "learning_rate": 3.919816059123778e-06, "loss": 0.82936823, "num_input_tokens_seen": 32201635, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9765625, "step": 1506, "time_per_iteration": 2.4429192543029785 }, { "auxiliary_loss_clip": 0.01132694, "auxiliary_loss_mlp": 0.01044174, "balance_loss_clip": 1.0197835, "balance_loss_mlp": 1.03535104, "epoch": 0.09060574177062979, "flos": 27634941169920.0, "grad_norm": 1.9727846762699803, "language_loss": 0.75965023, "learning_rate": 3.919710115551092e-06, "loss": 0.78141892, "num_input_tokens_seen": 32221940, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.97265625, "step": 1507, "time_per_iteration": 2.5400660037994385 }, { "auxiliary_loss_clip": 0.01042598, "auxiliary_loss_mlp": 0.01005991, "balance_loss_clip": 1.00198567, "balance_loss_mlp": 1.00916338, "epoch": 0.09066586502329776, "flos": 66082657495680.0, "grad_norm": 0.7293385187982612, "language_loss": 0.57651293, "learning_rate": 3.91960410346889e-06, "loss": 0.59699887, "num_input_tokens_seen": 32276495, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.33398438, "step": 1508, "time_per_iteration": 2.967487335205078 }, { "auxiliary_loss_clip": 0.0113586, "auxiliary_loss_mlp": 0.0105616, "balance_loss_clip": 1.02919483, "balance_loss_mlp": 1.03609443, "epoch": 0.09072598827596573, "flos": 18915452801280.0, "grad_norm": 2.2190963044476137, "language_loss": 0.85160971, "learning_rate": 3.919498022880955e-06, "loss": 0.87352985, "num_input_tokens_seen": 32294130, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.99609375, "step": 1509, "time_per_iteration": 2.5288822650909424 }, { "auxiliary_loss_clip": 0.01141463, "auxiliary_loss_mlp": 0.01054211, "balance_loss_clip": 1.02641118, "balance_loss_mlp": 1.03611588, "epoch": 0.0907861115286337, "flos": 24820732385280.0, "grad_norm": 2.3260571756947472, "language_loss": 0.84302211, "learning_rate": 3.9193918737910735e-06, "loss": 0.86497879, "num_input_tokens_seen": 32313555, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.046875, "step": 1510, "time_per_iteration": 2.466231346130371 }, { "auxiliary_loss_clip": 0.01134825, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.02267158, "balance_loss_mlp": 1.03401542, "epoch": 0.09084623478130167, "flos": 21651770609280.0, "grad_norm": 1.912535516932508, "language_loss": 0.85478687, "learning_rate": 3.919285656203033e-06, "loss": 0.87663877, "num_input_tokens_seen": 32331430, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0078125, "step": 1511, "time_per_iteration": 2.4451420307159424 }, { "auxiliary_loss_clip": 0.01132187, "auxiliary_loss_mlp": 0.01048022, "balance_loss_clip": 1.02150965, "balance_loss_mlp": 1.03599679, "epoch": 0.09090635803396964, "flos": 27637943546880.0, "grad_norm": 1.7413602600900544, "language_loss": 0.85064685, "learning_rate": 3.919179370120624e-06, "loss": 0.87244892, "num_input_tokens_seen": 32353705, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.96484375, "step": 1512, "time_per_iteration": 2.4990103244781494 }, { "auxiliary_loss_clip": 0.0112873, "auxiliary_loss_mlp": 0.0104476, "balance_loss_clip": 1.02056026, "balance_loss_mlp": 1.03212404, "epoch": 0.0909664812866376, "flos": 17668355569920.0, "grad_norm": 2.479651232075728, "language_loss": 0.86426342, "learning_rate": 3.919073015547641e-06, "loss": 0.88599831, "num_input_tokens_seen": 32370520, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.96484375, "step": 1513, "time_per_iteration": 2.4611003398895264 }, { "auxiliary_loss_clip": 0.01134348, "auxiliary_loss_mlp": 0.01049628, "balance_loss_clip": 1.02377188, "balance_loss_mlp": 1.03542376, "epoch": 0.09102660453930557, "flos": 23950312479360.0, "grad_norm": 1.8659367863772227, "language_loss": 0.86158764, "learning_rate": 3.918966592487878e-06, "loss": 0.88342738, "num_input_tokens_seen": 32389105, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.98828125, "step": 1514, "time_per_iteration": 2.4910011291503906 }, { "auxiliary_loss_clip": 0.01134255, "auxiliary_loss_mlp": 0.01058063, "balance_loss_clip": 1.03387547, "balance_loss_mlp": 1.03549898, "epoch": 0.09108672779197355, "flos": 25811741727360.0, "grad_norm": 1.844292454397013, "language_loss": 0.90314281, "learning_rate": 3.918860100945134e-06, "loss": 0.92506593, "num_input_tokens_seen": 32408065, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.984375, "step": 1515, "time_per_iteration": 3.9674715995788574 }, { "auxiliary_loss_clip": 0.01134262, "auxiliary_loss_mlp": 0.01046491, "balance_loss_clip": 1.02043211, "balance_loss_mlp": 1.03435397, "epoch": 0.09114685104464151, "flos": 29638292561280.0, "grad_norm": 2.112813599939862, "language_loss": 0.85246992, "learning_rate": 3.9187535409232076e-06, "loss": 0.87427747, "num_input_tokens_seen": 32427225, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.0, "step": 1516, "time_per_iteration": 2.516740322113037 }, { "auxiliary_loss_clip": 0.01138227, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.02510583, "balance_loss_mlp": 1.03607202, "epoch": 0.09120697429730948, "flos": 33728227758720.0, "grad_norm": 1.4460689545829237, "language_loss": 0.80797648, "learning_rate": 3.918646912425904e-06, "loss": 0.82986748, "num_input_tokens_seen": 32450510, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.0234375, "step": 1517, "time_per_iteration": 3.999890089035034 }, { "auxiliary_loss_clip": 0.01140348, "auxiliary_loss_mlp": 0.01059854, "balance_loss_clip": 1.03282976, "balance_loss_mlp": 1.03763103, "epoch": 0.09126709754997746, "flos": 18400519025280.0, "grad_norm": 1.570144501988006, "language_loss": 0.77740484, "learning_rate": 3.918540215457027e-06, "loss": 0.79940683, "num_input_tokens_seen": 32468425, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.03125, "step": 1518, "time_per_iteration": 2.447437047958374 }, { "auxiliary_loss_clip": 0.01133414, "auxiliary_loss_mlp": 0.01051801, "balance_loss_clip": 1.02459717, "balance_loss_mlp": 1.03456867, "epoch": 0.09132722080264542, "flos": 22090838267520.0, "grad_norm": 1.6758766032308245, "language_loss": 0.86130202, "learning_rate": 3.918433450020386e-06, "loss": 0.88315415, "num_input_tokens_seen": 32487510, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.98828125, "step": 1519, "time_per_iteration": 3.8800432682037354 }, { "auxiliary_loss_clip": 0.01136126, "auxiliary_loss_mlp": 0.01051936, "balance_loss_clip": 1.02413607, "balance_loss_mlp": 1.03529108, "epoch": 0.09138734405531339, "flos": 21032062243200.0, "grad_norm": 2.3850002057706474, "language_loss": 0.72785783, "learning_rate": 3.9183266161197885e-06, "loss": 0.74973845, "num_input_tokens_seen": 32507250, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.0078125, "step": 1520, "time_per_iteration": 3.880931854248047 }, { "auxiliary_loss_clip": 0.01135703, "auxiliary_loss_mlp": 0.01054564, "balance_loss_clip": 1.02647829, "balance_loss_mlp": 1.03522754, "epoch": 0.09144746730798137, "flos": 20082913488000.0, "grad_norm": 2.5358907338691727, "language_loss": 0.85057628, "learning_rate": 3.91821971375905e-06, "loss": 0.87247896, "num_input_tokens_seen": 32526045, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.0, "step": 1521, "time_per_iteration": 2.469526767730713 }, { "auxiliary_loss_clip": 0.01136681, "auxiliary_loss_mlp": 0.0105437, "balance_loss_clip": 1.02826309, "balance_loss_mlp": 1.03459895, "epoch": 0.09150759056064933, "flos": 22777265975040.0, "grad_norm": 2.7776145419894873, "language_loss": 0.83937508, "learning_rate": 3.918112742941983e-06, "loss": 0.86128557, "num_input_tokens_seen": 32546575, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.015625, "step": 1522, "time_per_iteration": 2.5015242099761963 }, { "auxiliary_loss_clip": 0.01129337, "auxiliary_loss_mlp": 0.01053417, "balance_loss_clip": 1.0273335, "balance_loss_mlp": 1.03404737, "epoch": 0.0915677138133173, "flos": 27562950213120.0, "grad_norm": 1.9851311064106862, "language_loss": 0.81124741, "learning_rate": 3.9180057036724066e-06, "loss": 0.83307493, "num_input_tokens_seen": 32568795, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.953125, "step": 1523, "time_per_iteration": 2.5210797786712646 }, { "auxiliary_loss_clip": 0.01135162, "auxiliary_loss_mlp": 0.01050573, "balance_loss_clip": 1.02509856, "balance_loss_mlp": 1.03610897, "epoch": 0.09162783706598528, "flos": 17673836653440.0, "grad_norm": 2.434152104453912, "language_loss": 0.74915415, "learning_rate": 3.9178985959541406e-06, "loss": 0.77101147, "num_input_tokens_seen": 32587010, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9921875, "step": 1524, "time_per_iteration": 2.440109968185425 }, { "auxiliary_loss_clip": 0.01134508, "auxiliary_loss_mlp": 0.01055211, "balance_loss_clip": 1.02829385, "balance_loss_mlp": 1.03354788, "epoch": 0.09168796031865324, "flos": 18477223015680.0, "grad_norm": 2.5147764071717886, "language_loss": 0.86025923, "learning_rate": 3.917791419791006e-06, "loss": 0.88215643, "num_input_tokens_seen": 32602375, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.0078125, "step": 1525, "time_per_iteration": 2.432311773300171 }, { "auxiliary_loss_clip": 0.01133316, "auxiliary_loss_mlp": 0.0104995, "balance_loss_clip": 1.02396262, "balance_loss_mlp": 1.03515005, "epoch": 0.0917480835713212, "flos": 29386324212480.0, "grad_norm": 2.096285881342677, "language_loss": 0.7531842, "learning_rate": 3.91768417518683e-06, "loss": 0.77501684, "num_input_tokens_seen": 32621460, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.98046875, "step": 1526, "time_per_iteration": 2.508889675140381 }, { "auxiliary_loss_clip": 0.01133753, "auxiliary_loss_mlp": 0.01048868, "balance_loss_clip": 1.02340519, "balance_loss_mlp": 1.0355401, "epoch": 0.09180820682398917, "flos": 19828222053120.0, "grad_norm": 2.155732744211786, "language_loss": 0.77275509, "learning_rate": 3.917576862145438e-06, "loss": 0.79458129, "num_input_tokens_seen": 32640440, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.984375, "step": 1527, "time_per_iteration": 2.504575490951538 }, { "auxiliary_loss_clip": 0.01134489, "auxiliary_loss_mlp": 0.01052293, "balance_loss_clip": 1.02495801, "balance_loss_mlp": 1.03549433, "epoch": 0.09186833007665715, "flos": 23840720121600.0, "grad_norm": 2.472114783236302, "language_loss": 0.78673851, "learning_rate": 3.91746948067066e-06, "loss": 0.80860639, "num_input_tokens_seen": 32660020, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.9921875, "step": 1528, "time_per_iteration": 2.497410297393799 }, { "auxiliary_loss_clip": 0.01133446, "auxiliary_loss_mlp": 0.01044216, "balance_loss_clip": 1.01814508, "balance_loss_mlp": 1.03460348, "epoch": 0.09192845332932512, "flos": 12931898215680.0, "grad_norm": 2.7292163851837303, "language_loss": 0.77312195, "learning_rate": 3.91736203076633e-06, "loss": 0.79489857, "num_input_tokens_seen": 32678170, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.98828125, "step": 1529, "time_per_iteration": 2.433762550354004 }, { "auxiliary_loss_clip": 0.01133114, "auxiliary_loss_mlp": 0.01049329, "balance_loss_clip": 1.02217329, "balance_loss_mlp": 1.03224063, "epoch": 0.09198857658199308, "flos": 24567123202560.0, "grad_norm": 1.8967014901884687, "language_loss": 0.8285197, "learning_rate": 3.9172545124362795e-06, "loss": 0.85034418, "num_input_tokens_seen": 32697540, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0078125, "step": 1530, "time_per_iteration": 2.4446945190429688 }, { "auxiliary_loss_clip": 0.0113179, "auxiliary_loss_mlp": 0.01055151, "balance_loss_clip": 1.02943802, "balance_loss_mlp": 1.03457022, "epoch": 0.09204869983466106, "flos": 20265893256960.0, "grad_norm": 2.6123554775526823, "language_loss": 0.83155543, "learning_rate": 3.9171469256843484e-06, "loss": 0.85342479, "num_input_tokens_seen": 32716805, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.96875, "step": 1531, "time_per_iteration": 2.4508118629455566 }, { "auxiliary_loss_clip": 0.01131607, "auxiliary_loss_mlp": 0.01050721, "balance_loss_clip": 1.02392328, "balance_loss_mlp": 1.03305411, "epoch": 0.09210882308732903, "flos": 20884624104960.0, "grad_norm": 3.281697007764632, "language_loss": 0.81480652, "learning_rate": 3.917039270514375e-06, "loss": 0.83662981, "num_input_tokens_seen": 32736385, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.984375, "step": 1532, "time_per_iteration": 2.4293484687805176 }, { "auxiliary_loss_clip": 0.01135024, "auxiliary_loss_mlp": 0.01053959, "balance_loss_clip": 1.02642202, "balance_loss_mlp": 1.0354228, "epoch": 0.09216894633999699, "flos": 30955006776960.0, "grad_norm": 2.533561225734148, "language_loss": 0.83641398, "learning_rate": 3.9169315469302e-06, "loss": 0.85830384, "num_input_tokens_seen": 32757140, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.99609375, "step": 1533, "time_per_iteration": 2.507021188735962 }, { "auxiliary_loss_clip": 0.01133847, "auxiliary_loss_mlp": 0.01046285, "balance_loss_clip": 1.02035689, "balance_loss_mlp": 1.03520799, "epoch": 0.09222906959266497, "flos": 13150733817600.0, "grad_norm": 2.012719717570496, "language_loss": 0.90133536, "learning_rate": 3.91682375493567e-06, "loss": 0.92313659, "num_input_tokens_seen": 32774860, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.984375, "step": 1534, "time_per_iteration": 2.387143611907959 }, { "auxiliary_loss_clip": 0.01133891, "auxiliary_loss_mlp": 0.01064751, "balance_loss_clip": 1.03456783, "balance_loss_mlp": 1.03374577, "epoch": 0.09228919284533293, "flos": 25993290130560.0, "grad_norm": 1.9768533857068011, "language_loss": 0.75789332, "learning_rate": 3.916715894534631e-06, "loss": 0.77987975, "num_input_tokens_seen": 32795250, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.0, "step": 1535, "time_per_iteration": 2.490509271621704 }, { "auxiliary_loss_clip": 0.01127005, "auxiliary_loss_mlp": 0.01048225, "balance_loss_clip": 1.02288067, "balance_loss_mlp": 1.03255856, "epoch": 0.0923493160980009, "flos": 18659818759680.0, "grad_norm": 1.6612240374319367, "language_loss": 0.81129748, "learning_rate": 3.916607965730932e-06, "loss": 0.83304977, "num_input_tokens_seen": 32813805, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9453125, "step": 1536, "time_per_iteration": 2.3843331336975098 }, { "auxiliary_loss_clip": 0.0112877, "auxiliary_loss_mlp": 0.01049071, "balance_loss_clip": 1.02415609, "balance_loss_mlp": 1.03268063, "epoch": 0.09240943935066886, "flos": 21139559919360.0, "grad_norm": 3.3094259332098237, "language_loss": 0.89304686, "learning_rate": 3.9164999685284245e-06, "loss": 0.91482526, "num_input_tokens_seen": 32830960, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.9609375, "step": 1537, "time_per_iteration": 2.4380135536193848 }, { "auxiliary_loss_clip": 0.01130553, "auxiliary_loss_mlp": 0.01052005, "balance_loss_clip": 1.02496839, "balance_loss_mlp": 1.03329217, "epoch": 0.09246956260333684, "flos": 20591458485120.0, "grad_norm": 2.218947463308206, "language_loss": 0.81051397, "learning_rate": 3.916391902930963e-06, "loss": 0.83233953, "num_input_tokens_seen": 32848275, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.97265625, "step": 1538, "time_per_iteration": 2.4207215309143066 }, { "auxiliary_loss_clip": 0.01047669, "auxiliary_loss_mlp": 0.01006016, "balance_loss_clip": 1.00153339, "balance_loss_mlp": 1.01349711, "epoch": 0.09252968585600481, "flos": 67555153664640.0, "grad_norm": 0.7330743763034323, "language_loss": 0.57387245, "learning_rate": 3.916283768942404e-06, "loss": 0.59440935, "num_input_tokens_seen": 32917730, "router_z_loss_clip": 0.04492188, "router_z_loss_mlp": 0.34179688, "step": 1539, "time_per_iteration": 3.207620620727539 }, { "auxiliary_loss_clip": 0.01134897, "auxiliary_loss_mlp": 0.01046346, "balance_loss_clip": 1.02051282, "balance_loss_mlp": 1.03614211, "epoch": 0.09258980910867277, "flos": 17382905360640.0, "grad_norm": 3.029941687293074, "language_loss": 0.67660999, "learning_rate": 3.916175566566607e-06, "loss": 0.69842243, "num_input_tokens_seen": 32934910, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.98828125, "step": 1540, "time_per_iteration": 2.4448249340057373 }, { "auxiliary_loss_clip": 0.01133389, "auxiliary_loss_mlp": 0.01046343, "balance_loss_clip": 1.01944923, "balance_loss_mlp": 1.03585327, "epoch": 0.09264993236134075, "flos": 19864880847360.0, "grad_norm": 1.941268230130487, "language_loss": 0.83593309, "learning_rate": 3.916067295807433e-06, "loss": 0.85773039, "num_input_tokens_seen": 32953840, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.9765625, "step": 1541, "time_per_iteration": 2.517001152038574 }, { "auxiliary_loss_clip": 0.01044401, "auxiliary_loss_mlp": 0.01004169, "balance_loss_clip": 1.00006866, "balance_loss_mlp": 1.0115366, "epoch": 0.09271005561400872, "flos": 62281558909440.0, "grad_norm": 0.8820698294636038, "language_loss": 0.61850953, "learning_rate": 3.915958956668745e-06, "loss": 0.63899529, "num_input_tokens_seen": 33011410, "router_z_loss_clip": 0.04101562, "router_z_loss_mlp": 0.328125, "step": 1542, "time_per_iteration": 3.1013567447662354 }, { "auxiliary_loss_clip": 0.01131936, "auxiliary_loss_mlp": 0.01053385, "balance_loss_clip": 1.02901816, "balance_loss_mlp": 1.0335089, "epoch": 0.09277017886667668, "flos": 23328788722560.0, "grad_norm": 1.826825102411416, "language_loss": 0.82780075, "learning_rate": 3.915850549154412e-06, "loss": 0.84965402, "num_input_tokens_seen": 33031675, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.984375, "step": 1543, "time_per_iteration": 2.4943008422851562 }, { "auxiliary_loss_clip": 0.0113051, "auxiliary_loss_mlp": 0.01050692, "balance_loss_clip": 1.02413249, "balance_loss_mlp": 1.03472304, "epoch": 0.09283030211934466, "flos": 54743183435520.0, "grad_norm": 1.8056748607287116, "language_loss": 0.72402155, "learning_rate": 3.9157420732682995e-06, "loss": 0.74583352, "num_input_tokens_seen": 33056355, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.95703125, "step": 1544, "time_per_iteration": 2.719080924987793 }, { "auxiliary_loss_clip": 0.0113308, "auxiliary_loss_mlp": 0.01047957, "balance_loss_clip": 1.02156389, "balance_loss_mlp": 1.03499806, "epoch": 0.09289042537201263, "flos": 30333517931520.0, "grad_norm": 2.5526843283161744, "language_loss": 0.77444106, "learning_rate": 3.91563352901428e-06, "loss": 0.79625142, "num_input_tokens_seen": 33079520, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.98046875, "step": 1545, "time_per_iteration": 2.5514965057373047 }, { "auxiliary_loss_clip": 0.01129955, "auxiliary_loss_mlp": 0.01049175, "balance_loss_clip": 1.02300811, "balance_loss_mlp": 1.03411222, "epoch": 0.09295054862468059, "flos": 17745932344320.0, "grad_norm": 2.695182505976073, "language_loss": 0.74121594, "learning_rate": 3.915524916396229e-06, "loss": 0.76300728, "num_input_tokens_seen": 33096135, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.95703125, "step": 1546, "time_per_iteration": 2.397153854370117 }, { "auxiliary_loss_clip": 0.01133425, "auxiliary_loss_mlp": 0.01045961, "balance_loss_clip": 1.02051032, "balance_loss_mlp": 1.03395939, "epoch": 0.09301067187734856, "flos": 23656937391360.0, "grad_norm": 1.798481983913879, "language_loss": 0.8445034, "learning_rate": 3.91541623541802e-06, "loss": 0.86629736, "num_input_tokens_seen": 33115245, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9921875, "step": 1547, "time_per_iteration": 2.485987901687622 }, { "auxiliary_loss_clip": 0.01131087, "auxiliary_loss_mlp": 0.01051158, "balance_loss_clip": 1.02568305, "balance_loss_mlp": 1.03294826, "epoch": 0.09307079513001654, "flos": 27526465975680.0, "grad_norm": 2.1358068082235433, "language_loss": 0.67515683, "learning_rate": 3.9153074860835326e-06, "loss": 0.69697928, "num_input_tokens_seen": 33136640, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.98046875, "step": 1548, "time_per_iteration": 2.512816905975342 }, { "auxiliary_loss_clip": 0.01134184, "auxiliary_loss_mlp": 0.01056214, "balance_loss_clip": 1.02990484, "balance_loss_mlp": 1.0346415, "epoch": 0.0931309183826845, "flos": 20626406622720.0, "grad_norm": 1.908903974905939, "language_loss": 0.83415234, "learning_rate": 3.915198668396649e-06, "loss": 0.85605627, "num_input_tokens_seen": 33155060, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9921875, "step": 1549, "time_per_iteration": 2.4752960205078125 }, { "auxiliary_loss_clip": 0.011351, "auxiliary_loss_mlp": 0.01042165, "balance_loss_clip": 1.01690459, "balance_loss_mlp": 1.03577685, "epoch": 0.09319104163535247, "flos": 29019701358720.0, "grad_norm": 1.6348414684861816, "language_loss": 0.75787747, "learning_rate": 3.91508978236125e-06, "loss": 0.77965015, "num_input_tokens_seen": 33175420, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.9921875, "step": 1550, "time_per_iteration": 2.4757087230682373 }, { "auxiliary_loss_clip": 0.0113549, "auxiliary_loss_mlp": 0.01054175, "balance_loss_clip": 1.02480149, "balance_loss_mlp": 1.03360713, "epoch": 0.09325116488802045, "flos": 25300368910080.0, "grad_norm": 2.818707974293908, "language_loss": 0.82972282, "learning_rate": 3.914980827981223e-06, "loss": 0.85161948, "num_input_tokens_seen": 33194120, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.015625, "step": 1551, "time_per_iteration": 2.5096378326416016 }, { "auxiliary_loss_clip": 0.01036712, "auxiliary_loss_mlp": 0.01005773, "balance_loss_clip": 1.00164795, "balance_loss_mlp": 1.00625217, "epoch": 0.09331128814068841, "flos": 61532880514560.0, "grad_norm": 0.7464859419302465, "language_loss": 0.61793554, "learning_rate": 3.914871805260456e-06, "loss": 0.63836038, "num_input_tokens_seen": 33261080, "router_z_loss_clip": 0.04125977, "router_z_loss_mlp": 0.3046875, "step": 1552, "time_per_iteration": 3.1823689937591553 }, { "auxiliary_loss_clip": 0.01035259, "auxiliary_loss_mlp": 0.01005669, "balance_loss_clip": 1.00163996, "balance_loss_mlp": 1.0053786, "epoch": 0.09337141139335638, "flos": 53290515052800.0, "grad_norm": 0.8366902008839252, "language_loss": 0.59049493, "learning_rate": 3.91476271420284e-06, "loss": 0.61090428, "num_input_tokens_seen": 33330235, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.29882812, "step": 1553, "time_per_iteration": 3.234530210494995 }, { "auxiliary_loss_clip": 0.01133075, "auxiliary_loss_mlp": 0.01050295, "balance_loss_clip": 1.02307987, "balance_loss_mlp": 1.03343916, "epoch": 0.09343153464602436, "flos": 23475738101760.0, "grad_norm": 1.8482203056914184, "language_loss": 0.87292784, "learning_rate": 3.914653554812269e-06, "loss": 0.89476156, "num_input_tokens_seen": 33349035, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.99609375, "step": 1554, "time_per_iteration": 2.4784629344940186 }, { "auxiliary_loss_clip": 0.01130778, "auxiliary_loss_mlp": 0.01048842, "balance_loss_clip": 1.02203178, "balance_loss_mlp": 1.03553355, "epoch": 0.09349165789869232, "flos": 19352495600640.0, "grad_norm": 1.8549894743094775, "language_loss": 0.81752455, "learning_rate": 3.914544327092637e-06, "loss": 0.83932072, "num_input_tokens_seen": 33368060, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.953125, "step": 1555, "time_per_iteration": 3.996166944503784 }, { "auxiliary_loss_clip": 0.01034732, "auxiliary_loss_mlp": 0.01003965, "balance_loss_clip": 1.00012672, "balance_loss_mlp": 1.00511003, "epoch": 0.09355178115136029, "flos": 67499572913280.0, "grad_norm": 0.8684699997120257, "language_loss": 0.5964554, "learning_rate": 3.914435031047844e-06, "loss": 0.61684233, "num_input_tokens_seen": 33430825, "router_z_loss_clip": 0.03833008, "router_z_loss_mlp": 0.296875, "step": 1556, "time_per_iteration": 3.087555408477783 }, { "auxiliary_loss_clip": 0.01132066, "auxiliary_loss_mlp": 0.01051652, "balance_loss_clip": 1.02517581, "balance_loss_mlp": 1.03376365, "epoch": 0.09361190440402825, "flos": 37340132353920.0, "grad_norm": 1.8014126832529527, "language_loss": 0.8437897, "learning_rate": 3.9143256666817875e-06, "loss": 0.86562681, "num_input_tokens_seen": 33454855, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.98046875, "step": 1557, "time_per_iteration": 3.954483985900879 }, { "auxiliary_loss_clip": 0.01131742, "auxiliary_loss_mlp": 0.01051749, "balance_loss_clip": 1.02405715, "balance_loss_mlp": 1.03336477, "epoch": 0.09367202765669623, "flos": 24898553539200.0, "grad_norm": 1.7844932698726639, "language_loss": 0.77857816, "learning_rate": 3.914216233998373e-06, "loss": 0.80041307, "num_input_tokens_seen": 33476000, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.984375, "step": 1558, "time_per_iteration": 3.984492540359497 }, { "auxiliary_loss_clip": 0.01135984, "auxiliary_loss_mlp": 0.01048342, "balance_loss_clip": 1.02124536, "balance_loss_mlp": 1.03523898, "epoch": 0.0937321509093642, "flos": 15704665349760.0, "grad_norm": 1.858447994209009, "language_loss": 0.79866064, "learning_rate": 3.914106733001505e-06, "loss": 0.82050389, "num_input_tokens_seen": 33493845, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.0078125, "step": 1559, "time_per_iteration": 3.8980712890625 }, { "auxiliary_loss_clip": 0.01127696, "auxiliary_loss_mlp": 0.01048208, "balance_loss_clip": 1.02249503, "balance_loss_mlp": 1.03341794, "epoch": 0.09379227416203216, "flos": 20482704000000.0, "grad_norm": 3.145335706741672, "language_loss": 0.76307881, "learning_rate": 3.9139971636950914e-06, "loss": 0.78483784, "num_input_tokens_seen": 33510850, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.94140625, "step": 1560, "time_per_iteration": 2.4804866313934326 }, { "auxiliary_loss_clip": 0.01137378, "auxiliary_loss_mlp": 0.01049168, "balance_loss_clip": 1.02276349, "balance_loss_mlp": 1.03392553, "epoch": 0.09385239741470014, "flos": 24351359800320.0, "grad_norm": 1.684229005364842, "language_loss": 0.80698353, "learning_rate": 3.913887526083042e-06, "loss": 0.82884896, "num_input_tokens_seen": 33530430, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.0390625, "step": 1561, "time_per_iteration": 2.5235869884490967 }, { "auxiliary_loss_clip": 0.01130155, "auxiliary_loss_mlp": 0.01042075, "balance_loss_clip": 1.01642132, "balance_loss_mlp": 1.03278327, "epoch": 0.0939125206673681, "flos": 33290102707200.0, "grad_norm": 6.11597418955832, "language_loss": 0.61490536, "learning_rate": 3.91377782016927e-06, "loss": 0.63662767, "num_input_tokens_seen": 33551975, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.97265625, "step": 1562, "time_per_iteration": 2.5785610675811768 }, { "auxiliary_loss_clip": 0.01132565, "auxiliary_loss_mlp": 0.01051863, "balance_loss_clip": 1.02629256, "balance_loss_mlp": 1.03629041, "epoch": 0.09397264392003607, "flos": 19243915672320.0, "grad_norm": 9.651361957153787, "language_loss": 0.84796524, "learning_rate": 3.9136680459576905e-06, "loss": 0.86980951, "num_input_tokens_seen": 33569850, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.96484375, "step": 1563, "time_per_iteration": 2.445735216140747 }, { "auxiliary_loss_clip": 0.01126767, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.01910937, "balance_loss_mlp": 1.03224373, "epoch": 0.09403276717270405, "flos": 19316919058560.0, "grad_norm": 1.689177225733662, "language_loss": 0.75749022, "learning_rate": 3.913558203452221e-06, "loss": 0.77918339, "num_input_tokens_seen": 33590510, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.9453125, "step": 1564, "time_per_iteration": 2.600019931793213 }, { "auxiliary_loss_clip": 0.01131358, "auxiliary_loss_mlp": 0.01046324, "balance_loss_clip": 1.02163541, "balance_loss_mlp": 1.03432405, "epoch": 0.09409289042537201, "flos": 23582432816640.0, "grad_norm": 2.075383688901369, "language_loss": 0.80019706, "learning_rate": 3.913448292656782e-06, "loss": 0.82197386, "num_input_tokens_seen": 33608810, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.96875, "step": 1565, "time_per_iteration": 2.5562853813171387 }, { "auxiliary_loss_clip": 0.01128414, "auxiliary_loss_mlp": 0.01053288, "balance_loss_clip": 1.0280869, "balance_loss_mlp": 1.03111577, "epoch": 0.09415301367803998, "flos": 20077572049920.0, "grad_norm": 1.9569232200602484, "language_loss": 0.75231785, "learning_rate": 3.913338313575295e-06, "loss": 0.77413487, "num_input_tokens_seen": 33627265, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.97265625, "step": 1566, "time_per_iteration": 2.5157737731933594 }, { "auxiliary_loss_clip": 0.01128828, "auxiliary_loss_mlp": 0.01056516, "balance_loss_clip": 1.03018296, "balance_loss_mlp": 1.03255665, "epoch": 0.09421313693070796, "flos": 21061215095040.0, "grad_norm": 1.8935387915162705, "language_loss": 0.77399063, "learning_rate": 3.913228266211685e-06, "loss": 0.79584408, "num_input_tokens_seen": 33644810, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9609375, "step": 1567, "time_per_iteration": 2.4394407272338867 }, { "auxiliary_loss_clip": 0.01132407, "auxiliary_loss_mlp": 0.01048987, "balance_loss_clip": 1.02391696, "balance_loss_mlp": 1.03506601, "epoch": 0.09427326018337592, "flos": 24315015208320.0, "grad_norm": 1.8373050423611277, "language_loss": 0.82380879, "learning_rate": 3.91311815056988e-06, "loss": 0.84562278, "num_input_tokens_seen": 33665665, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.97265625, "step": 1568, "time_per_iteration": 2.4735612869262695 }, { "auxiliary_loss_clip": 0.01133743, "auxiliary_loss_mlp": 0.01047137, "balance_loss_clip": 1.02000451, "balance_loss_mlp": 1.035182, "epoch": 0.09433338343604389, "flos": 20262925791360.0, "grad_norm": 3.0445244276773686, "language_loss": 0.76563734, "learning_rate": 3.9130079666538094e-06, "loss": 0.78744614, "num_input_tokens_seen": 33684760, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.984375, "step": 1569, "time_per_iteration": 2.4458184242248535 }, { "auxiliary_loss_clip": 0.01128653, "auxiliary_loss_mlp": 0.01054956, "balance_loss_clip": 1.02970767, "balance_loss_mlp": 1.03367496, "epoch": 0.09439350668871185, "flos": 12742355111040.0, "grad_norm": 2.085834786434566, "language_loss": 0.85499036, "learning_rate": 3.912897714467405e-06, "loss": 0.87682647, "num_input_tokens_seen": 33700750, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.94921875, "step": 1570, "time_per_iteration": 2.400621175765991 }, { "auxiliary_loss_clip": 0.01132324, "auxiliary_loss_mlp": 0.01047512, "balance_loss_clip": 1.02268112, "balance_loss_mlp": 1.03605151, "epoch": 0.09445362994137983, "flos": 25960960344960.0, "grad_norm": 1.7147238482541927, "language_loss": 0.76369232, "learning_rate": 3.912787394014602e-06, "loss": 0.78549063, "num_input_tokens_seen": 33724430, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.9609375, "step": 1571, "time_per_iteration": 2.503005266189575 }, { "auxiliary_loss_clip": 0.01125115, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.02462876, "balance_loss_mlp": 1.03338134, "epoch": 0.0945137531940478, "flos": 19714440332160.0, "grad_norm": 1.5826613962874685, "language_loss": 0.79275006, "learning_rate": 3.912677005299337e-06, "loss": 0.81449127, "num_input_tokens_seen": 33743455, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.91796875, "step": 1572, "time_per_iteration": 2.4463391304016113 }, { "auxiliary_loss_clip": 0.01128702, "auxiliary_loss_mlp": 0.0105154, "balance_loss_clip": 1.02736449, "balance_loss_mlp": 1.03395796, "epoch": 0.09457387644671576, "flos": 23616089233920.0, "grad_norm": 1.9490236600921087, "language_loss": 0.87449968, "learning_rate": 3.912566548325549e-06, "loss": 0.8963021, "num_input_tokens_seen": 33763435, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.94921875, "step": 1573, "time_per_iteration": 2.4512505531311035 }, { "auxiliary_loss_clip": 0.01130399, "auxiliary_loss_mlp": 0.0105662, "balance_loss_clip": 1.0299294, "balance_loss_mlp": 1.0338819, "epoch": 0.09463399969938374, "flos": 26906059382400.0, "grad_norm": 3.9008503413191, "language_loss": 0.81712437, "learning_rate": 3.912456023097182e-06, "loss": 0.83899456, "num_input_tokens_seen": 33784325, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.96484375, "step": 1574, "time_per_iteration": 2.4951813220977783 }, { "auxiliary_loss_clip": 0.01129031, "auxiliary_loss_mlp": 0.01044704, "balance_loss_clip": 1.01958644, "balance_loss_mlp": 1.03409958, "epoch": 0.0946941229520517, "flos": 23658438579840.0, "grad_norm": 1.8567349415175596, "language_loss": 0.81094515, "learning_rate": 3.912345429618178e-06, "loss": 0.83268249, "num_input_tokens_seen": 33802510, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.94921875, "step": 1575, "time_per_iteration": 2.443317174911499 }, { "auxiliary_loss_clip": 0.01126348, "auxiliary_loss_mlp": 0.01052319, "balance_loss_clip": 1.02739215, "balance_loss_mlp": 1.03227639, "epoch": 0.09475424620471967, "flos": 24132908223360.0, "grad_norm": 2.4286094261598494, "language_loss": 0.86847895, "learning_rate": 3.912234767892486e-06, "loss": 0.89026564, "num_input_tokens_seen": 33819980, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.94140625, "step": 1576, "time_per_iteration": 2.446354866027832 }, { "auxiliary_loss_clip": 0.01039027, "auxiliary_loss_mlp": 0.01006219, "balance_loss_clip": 1.00252378, "balance_loss_mlp": 1.00889707, "epoch": 0.09481436945738765, "flos": 68422815573120.0, "grad_norm": 0.9876111566020145, "language_loss": 0.65870196, "learning_rate": 3.912124037924053e-06, "loss": 0.67915446, "num_input_tokens_seen": 33878925, "router_z_loss_clip": 0.03686523, "router_z_loss_mlp": 0.30078125, "step": 1577, "time_per_iteration": 3.0272223949432373 }, { "auxiliary_loss_clip": 0.01129314, "auxiliary_loss_mlp": 0.01042959, "balance_loss_clip": 1.01878285, "balance_loss_mlp": 1.03307056, "epoch": 0.09487449271005562, "flos": 16653150789120.0, "grad_norm": 2.0004661920780817, "language_loss": 0.79035044, "learning_rate": 3.912013239716831e-06, "loss": 0.81207317, "num_input_tokens_seen": 33897600, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.9609375, "step": 1578, "time_per_iteration": 2.4237899780273438 }, { "auxiliary_loss_clip": 0.01127281, "auxiliary_loss_mlp": 0.01051388, "balance_loss_clip": 1.02628279, "balance_loss_mlp": 1.03219068, "epoch": 0.09493461596272358, "flos": 24274655809920.0, "grad_norm": 1.757623102725029, "language_loss": 0.78247732, "learning_rate": 3.911902373274776e-06, "loss": 0.80426395, "num_input_tokens_seen": 33917365, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.953125, "step": 1579, "time_per_iteration": 2.4414737224578857 }, { "auxiliary_loss_clip": 0.011302, "auxiliary_loss_mlp": 0.01053094, "balance_loss_clip": 1.02664101, "balance_loss_mlp": 1.03369892, "epoch": 0.09499473921539155, "flos": 21869139934080.0, "grad_norm": 2.1555160267596505, "language_loss": 0.72842288, "learning_rate": 3.911791438601842e-06, "loss": 0.75025582, "num_input_tokens_seen": 33936680, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.96484375, "step": 1580, "time_per_iteration": 2.4454803466796875 }, { "auxiliary_loss_clip": 0.01125442, "auxiliary_loss_mlp": 0.01051209, "balance_loss_clip": 1.02759409, "balance_loss_mlp": 1.03187084, "epoch": 0.09505486246805953, "flos": 33545736748800.0, "grad_norm": 1.9535485397853518, "language_loss": 0.77732539, "learning_rate": 3.91168043570199e-06, "loss": 0.79909194, "num_input_tokens_seen": 33960685, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.9375, "step": 1581, "time_per_iteration": 2.5470573902130127 }, { "auxiliary_loss_clip": 0.01127665, "auxiliary_loss_mlp": 0.01049812, "balance_loss_clip": 1.02551746, "balance_loss_mlp": 1.0330832, "epoch": 0.09511498572072749, "flos": 21214273962240.0, "grad_norm": 1.921177984257379, "language_loss": 0.87140906, "learning_rate": 3.911569364579181e-06, "loss": 0.89318383, "num_input_tokens_seen": 33980015, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.9453125, "step": 1582, "time_per_iteration": 2.4393773078918457 }, { "auxiliary_loss_clip": 0.0112801, "auxiliary_loss_mlp": 0.01042294, "balance_loss_clip": 1.01530528, "balance_loss_mlp": 1.03308201, "epoch": 0.09517510897339546, "flos": 14610382606080.0, "grad_norm": 1.9526238988739044, "language_loss": 0.66777384, "learning_rate": 3.9114582252373786e-06, "loss": 0.68947685, "num_input_tokens_seen": 33997705, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.94921875, "step": 1583, "time_per_iteration": 2.408489465713501 }, { "auxiliary_loss_clip": 0.01130276, "auxiliary_loss_mlp": 0.01044433, "balance_loss_clip": 1.01768279, "balance_loss_mlp": 1.03359652, "epoch": 0.09523523222606343, "flos": 27816140459520.0, "grad_norm": 2.123446702097312, "language_loss": 0.70384932, "learning_rate": 3.911347017680548e-06, "loss": 0.72559643, "num_input_tokens_seen": 34017465, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.96484375, "step": 1584, "time_per_iteration": 2.4857635498046875 }, { "auxiliary_loss_clip": 0.01128904, "auxiliary_loss_mlp": 0.01043143, "balance_loss_clip": 1.01943183, "balance_loss_mlp": 1.03342712, "epoch": 0.0952953554787314, "flos": 20705170383360.0, "grad_norm": 1.4986264749081961, "language_loss": 0.81038153, "learning_rate": 3.911235741912659e-06, "loss": 0.832102, "num_input_tokens_seen": 34038550, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.95703125, "step": 1585, "time_per_iteration": 2.4599831104278564 }, { "auxiliary_loss_clip": 0.01131759, "auxiliary_loss_mlp": 0.01052134, "balance_loss_clip": 1.02458525, "balance_loss_mlp": 1.0344888, "epoch": 0.09535547873139937, "flos": 24786552297600.0, "grad_norm": 1.7153409836079414, "language_loss": 0.71711075, "learning_rate": 3.911124397937683e-06, "loss": 0.73894966, "num_input_tokens_seen": 34058665, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.97265625, "step": 1586, "time_per_iteration": 2.5065290927886963 }, { "auxiliary_loss_clip": 0.0103869, "auxiliary_loss_mlp": 0.01004867, "balance_loss_clip": 1.00143409, "balance_loss_mlp": 1.00877166, "epoch": 0.09541560198406734, "flos": 71909208230400.0, "grad_norm": 0.8043839504118597, "language_loss": 0.5548532, "learning_rate": 3.911012985759594e-06, "loss": 0.57528877, "num_input_tokens_seen": 34109655, "router_z_loss_clip": 0.03442383, "router_z_loss_mlp": 0.29882812, "step": 1587, "time_per_iteration": 2.8876850605010986 }, { "auxiliary_loss_clip": 0.01131229, "auxiliary_loss_mlp": 0.01053932, "balance_loss_clip": 1.02865958, "balance_loss_mlp": 1.03474832, "epoch": 0.09547572523673531, "flos": 28981436641920.0, "grad_norm": 1.6962794613973096, "language_loss": 0.80978215, "learning_rate": 3.910901505382367e-06, "loss": 0.83163375, "num_input_tokens_seen": 34131115, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.96484375, "step": 1588, "time_per_iteration": 2.5049076080322266 }, { "auxiliary_loss_clip": 0.01128484, "auxiliary_loss_mlp": 0.01048779, "balance_loss_clip": 1.02406645, "balance_loss_mlp": 1.03440237, "epoch": 0.09553584848940327, "flos": 24132768577920.0, "grad_norm": 1.5583499472319169, "language_loss": 0.81693327, "learning_rate": 3.910789956809981e-06, "loss": 0.8387059, "num_input_tokens_seen": 34151925, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.94140625, "step": 1589, "time_per_iteration": 2.444581985473633 }, { "auxiliary_loss_clip": 0.01129849, "auxiliary_loss_mlp": 0.01049648, "balance_loss_clip": 1.02476883, "balance_loss_mlp": 1.0345633, "epoch": 0.09559597174207124, "flos": 42849706055040.0, "grad_norm": 1.4846773363066526, "language_loss": 0.64840114, "learning_rate": 3.910678340046415e-06, "loss": 0.67019612, "num_input_tokens_seen": 34175395, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.953125, "step": 1590, "time_per_iteration": 2.6288366317749023 }, { "auxiliary_loss_clip": 0.01127174, "auxiliary_loss_mlp": 0.01048047, "balance_loss_clip": 1.02297759, "balance_loss_mlp": 1.03421664, "epoch": 0.09565609499473922, "flos": 32669486645760.0, "grad_norm": 1.9044171830956573, "language_loss": 0.83177459, "learning_rate": 3.910566655095655e-06, "loss": 0.85352671, "num_input_tokens_seen": 34197760, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.9296875, "step": 1591, "time_per_iteration": 2.530949592590332 }, { "auxiliary_loss_clip": 0.01131455, "auxiliary_loss_mlp": 0.01058947, "balance_loss_clip": 1.03254199, "balance_loss_mlp": 1.03324676, "epoch": 0.09571621824740718, "flos": 18477432483840.0, "grad_norm": 2.665017419106436, "language_loss": 0.74207127, "learning_rate": 3.9104549019616855e-06, "loss": 0.76397526, "num_input_tokens_seen": 34215330, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.984375, "step": 1592, "time_per_iteration": 2.3857264518737793 }, { "auxiliary_loss_clip": 0.01126811, "auxiliary_loss_mlp": 0.0104743, "balance_loss_clip": 1.02271771, "balance_loss_mlp": 1.03202522, "epoch": 0.09577634150007515, "flos": 29386219478400.0, "grad_norm": 1.841806432839428, "language_loss": 0.74010116, "learning_rate": 3.910343080648495e-06, "loss": 0.76184356, "num_input_tokens_seen": 34237745, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.94921875, "step": 1593, "time_per_iteration": 2.4896321296691895 }, { "auxiliary_loss_clip": 0.01128345, "auxiliary_loss_mlp": 0.01050887, "balance_loss_clip": 1.02528095, "balance_loss_mlp": 1.03316426, "epoch": 0.09583646475274313, "flos": 22746716668800.0, "grad_norm": 1.7383322301339936, "language_loss": 0.69956505, "learning_rate": 3.910231191160074e-06, "loss": 0.72135735, "num_input_tokens_seen": 34256565, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.953125, "step": 1594, "time_per_iteration": 3.940382480621338 }, { "auxiliary_loss_clip": 0.01129306, "auxiliary_loss_mlp": 0.01044947, "balance_loss_clip": 1.01942444, "balance_loss_mlp": 1.03309989, "epoch": 0.0958965880054111, "flos": 23217346062720.0, "grad_norm": 2.301854053189111, "language_loss": 0.8258779, "learning_rate": 3.910119233500415e-06, "loss": 0.84762043, "num_input_tokens_seen": 34275970, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.96484375, "step": 1595, "time_per_iteration": 2.4429819583892822 }, { "auxiliary_loss_clip": 0.01131211, "auxiliary_loss_mlp": 0.01047621, "balance_loss_clip": 1.02132368, "balance_loss_mlp": 1.03454077, "epoch": 0.09595671125807906, "flos": 21323377560960.0, "grad_norm": 2.15645836466016, "language_loss": 0.84589171, "learning_rate": 3.910007207673514e-06, "loss": 0.86767995, "num_input_tokens_seen": 34295490, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.96484375, "step": 1596, "time_per_iteration": 3.870670795440674 }, { "auxiliary_loss_clip": 0.01132189, "auxiliary_loss_mlp": 0.0104588, "balance_loss_clip": 1.01948714, "balance_loss_mlp": 1.03406048, "epoch": 0.09601683451074704, "flos": 39601910695680.0, "grad_norm": 1.8493887818937222, "language_loss": 0.69076598, "learning_rate": 3.909895113683369e-06, "loss": 0.71254671, "num_input_tokens_seen": 34319990, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.98046875, "step": 1597, "time_per_iteration": 2.581110715866089 }, { "auxiliary_loss_clip": 0.01130091, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.01574564, "balance_loss_mlp": 1.03267622, "epoch": 0.096076957763415, "flos": 23731581611520.0, "grad_norm": 2.043402237761072, "language_loss": 0.74736744, "learning_rate": 3.9097829515339805e-06, "loss": 0.76907706, "num_input_tokens_seen": 34339225, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.97265625, "step": 1598, "time_per_iteration": 3.935260057449341 }, { "auxiliary_loss_clip": 0.01134141, "auxiliary_loss_mlp": 0.01051248, "balance_loss_clip": 1.02383006, "balance_loss_mlp": 1.0345974, "epoch": 0.09613708101608297, "flos": 34676678286720.0, "grad_norm": 1.5883270883652745, "language_loss": 0.69103479, "learning_rate": 3.909670721229351e-06, "loss": 0.71288872, "num_input_tokens_seen": 34361020, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.9921875, "step": 1599, "time_per_iteration": 2.53983998298645 }, { "auxiliary_loss_clip": 0.01129399, "auxiliary_loss_mlp": 0.01047688, "balance_loss_clip": 1.02234411, "balance_loss_mlp": 1.03334928, "epoch": 0.09619720426875093, "flos": 20739001357440.0, "grad_norm": 2.1501053197670674, "language_loss": 0.84326446, "learning_rate": 3.909558422773485e-06, "loss": 0.8650353, "num_input_tokens_seen": 34378630, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9609375, "step": 1600, "time_per_iteration": 2.480672597885132 }, { "auxiliary_loss_clip": 0.01130237, "auxiliary_loss_mlp": 0.01051023, "balance_loss_clip": 1.0260725, "balance_loss_mlp": 1.03436995, "epoch": 0.09625732752141891, "flos": 13041874598400.0, "grad_norm": 3.115910981380097, "language_loss": 0.803262, "learning_rate": 3.909446056170392e-06, "loss": 0.82507461, "num_input_tokens_seen": 34397110, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.95703125, "step": 1601, "time_per_iteration": 2.407433032989502 }, { "auxiliary_loss_clip": 0.01133421, "auxiliary_loss_mlp": 0.01050606, "balance_loss_clip": 1.02371204, "balance_loss_mlp": 1.03521669, "epoch": 0.09631745077408688, "flos": 22272526316160.0, "grad_norm": 2.789109957056121, "language_loss": 0.82325655, "learning_rate": 3.9093336214240805e-06, "loss": 0.84509683, "num_input_tokens_seen": 34414165, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.984375, "step": 1602, "time_per_iteration": 2.4383020401000977 }, { "auxiliary_loss_clip": 0.01128537, "auxiliary_loss_mlp": 0.01051186, "balance_loss_clip": 1.02412581, "balance_loss_mlp": 1.0341289, "epoch": 0.09637757402675484, "flos": 24753105348480.0, "grad_norm": 2.012458938720864, "language_loss": 0.62555087, "learning_rate": 3.9092211185385625e-06, "loss": 0.64734805, "num_input_tokens_seen": 34434445, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.9453125, "step": 1603, "time_per_iteration": 2.452920913696289 }, { "auxiliary_loss_clip": 0.01132477, "auxiliary_loss_mlp": 0.01048033, "balance_loss_clip": 1.01939869, "balance_loss_mlp": 1.03522003, "epoch": 0.09643769727942282, "flos": 22524739044480.0, "grad_norm": 5.2125661320797105, "language_loss": 0.71050173, "learning_rate": 3.909108547517855e-06, "loss": 0.73230684, "num_input_tokens_seen": 34453095, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 0.97265625, "step": 1604, "time_per_iteration": 2.5001332759857178 }, { "auxiliary_loss_clip": 0.01129197, "auxiliary_loss_mlp": 0.01049554, "balance_loss_clip": 1.02432883, "balance_loss_mlp": 1.0339098, "epoch": 0.09649782053209079, "flos": 30919674614400.0, "grad_norm": 2.1476828999693787, "language_loss": 0.79755807, "learning_rate": 3.908995908365974e-06, "loss": 0.81934559, "num_input_tokens_seen": 34473680, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.953125, "step": 1605, "time_per_iteration": 2.516160249710083 }, { "auxiliary_loss_clip": 0.01129773, "auxiliary_loss_mlp": 0.01046575, "balance_loss_clip": 1.02044487, "balance_loss_mlp": 1.03133607, "epoch": 0.09655794378475875, "flos": 25336469122560.0, "grad_norm": 2.1327881144518552, "language_loss": 0.74646139, "learning_rate": 3.908883201086939e-06, "loss": 0.76822495, "num_input_tokens_seen": 34492610, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.984375, "step": 1606, "time_per_iteration": 2.4860920906066895 }, { "auxiliary_loss_clip": 0.01131643, "auxiliary_loss_mlp": 0.01043237, "balance_loss_clip": 1.01773846, "balance_loss_mlp": 1.03393614, "epoch": 0.09661806703742673, "flos": 22344971120640.0, "grad_norm": 1.7854779361931754, "language_loss": 0.75499034, "learning_rate": 3.908770425684774e-06, "loss": 0.77673924, "num_input_tokens_seen": 34511855, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9765625, "step": 1607, "time_per_iteration": 2.439432382583618 }, { "auxiliary_loss_clip": 0.01129381, "auxiliary_loss_mlp": 0.0104156, "balance_loss_clip": 1.01736069, "balance_loss_mlp": 1.03235519, "epoch": 0.0966781902900947, "flos": 17456606974080.0, "grad_norm": 1.966699091936902, "language_loss": 0.86513656, "learning_rate": 3.908657582163501e-06, "loss": 0.88684595, "num_input_tokens_seen": 34528905, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.97265625, "step": 1608, "time_per_iteration": 2.420522451400757 }, { "auxiliary_loss_clip": 0.01134142, "auxiliary_loss_mlp": 0.01055626, "balance_loss_clip": 1.02823162, "balance_loss_mlp": 1.0341233, "epoch": 0.09673831354276266, "flos": 36902496061440.0, "grad_norm": 2.6149778361642504, "language_loss": 0.71525943, "learning_rate": 3.90854467052715e-06, "loss": 0.73715711, "num_input_tokens_seen": 34548480, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.0, "step": 1609, "time_per_iteration": 2.5289125442504883 }, { "auxiliary_loss_clip": 0.01130136, "auxiliary_loss_mlp": 0.01046521, "balance_loss_clip": 1.02195179, "balance_loss_mlp": 1.03249955, "epoch": 0.09679843679543064, "flos": 20700422438400.0, "grad_norm": 2.195539894108231, "language_loss": 0.84416944, "learning_rate": 3.908431690779748e-06, "loss": 0.86593604, "num_input_tokens_seen": 34565410, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.9765625, "step": 1610, "time_per_iteration": 2.4299871921539307 }, { "auxiliary_loss_clip": 0.01129956, "auxiliary_loss_mlp": 0.0104887, "balance_loss_clip": 1.02248859, "balance_loss_mlp": 1.03369439, "epoch": 0.0968585600480986, "flos": 23513269680000.0, "grad_norm": 2.117691948263346, "language_loss": 0.6709789, "learning_rate": 3.9083186429253284e-06, "loss": 0.69276714, "num_input_tokens_seen": 34584840, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9609375, "step": 1611, "time_per_iteration": 2.4413843154907227 }, { "auxiliary_loss_clip": 0.01129039, "auxiliary_loss_mlp": 0.01048062, "balance_loss_clip": 1.02288496, "balance_loss_mlp": 1.03424144, "epoch": 0.09691868330076657, "flos": 20120026129920.0, "grad_norm": 1.79228238953497, "language_loss": 0.8106401, "learning_rate": 3.908205526967925e-06, "loss": 0.83241117, "num_input_tokens_seen": 34603360, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.9453125, "step": 1612, "time_per_iteration": 2.458664894104004 }, { "auxiliary_loss_clip": 0.01130977, "auxiliary_loss_mlp": 0.01048114, "balance_loss_clip": 1.02069569, "balance_loss_mlp": 1.03401744, "epoch": 0.09697880655343454, "flos": 16543767899520.0, "grad_norm": 2.123354943874359, "language_loss": 0.8074218, "learning_rate": 3.9080923429115755e-06, "loss": 0.82921273, "num_input_tokens_seen": 34620760, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.96875, "step": 1613, "time_per_iteration": 2.4841091632843018 }, { "auxiliary_loss_clip": 0.01131463, "auxiliary_loss_mlp": 0.01046134, "balance_loss_clip": 1.01866841, "balance_loss_mlp": 1.03485274, "epoch": 0.09703892980610251, "flos": 26102987222400.0, "grad_norm": 2.1194127715365556, "language_loss": 0.84466386, "learning_rate": 3.907979090760318e-06, "loss": 0.86643982, "num_input_tokens_seen": 34640695, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.96484375, "step": 1614, "time_per_iteration": 2.508568286895752 }, { "auxiliary_loss_clip": 0.0104157, "auxiliary_loss_mlp": 0.01004943, "balance_loss_clip": 1.00150931, "balance_loss_mlp": 1.01060295, "epoch": 0.09709905305877048, "flos": 60440273516160.0, "grad_norm": 0.7053662651999016, "language_loss": 0.54595566, "learning_rate": 3.907865770518194e-06, "loss": 0.56642085, "num_input_tokens_seen": 34702395, "router_z_loss_clip": 0.03442383, "router_z_loss_mlp": 0.30859375, "step": 1615, "time_per_iteration": 3.030308723449707 }, { "auxiliary_loss_clip": 0.01039482, "auxiliary_loss_mlp": 0.01002298, "balance_loss_clip": 0.99905533, "balance_loss_mlp": 1.00889277, "epoch": 0.09715917631143844, "flos": 57636503228160.0, "grad_norm": 0.8212475208661101, "language_loss": 0.58202291, "learning_rate": 3.9077523821892495e-06, "loss": 0.60244071, "num_input_tokens_seen": 34768910, "router_z_loss_clip": 0.0324707, "router_z_loss_mlp": 0.3046875, "step": 1616, "time_per_iteration": 3.2261805534362793 }, { "auxiliary_loss_clip": 0.01131525, "auxiliary_loss_mlp": 0.01057625, "balance_loss_clip": 1.03033817, "balance_loss_mlp": 1.03429449, "epoch": 0.09721929956410642, "flos": 20557173663360.0, "grad_norm": 1.883023529993233, "language_loss": 0.68717158, "learning_rate": 3.907638925777529e-06, "loss": 0.70906311, "num_input_tokens_seen": 34787680, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.97265625, "step": 1617, "time_per_iteration": 2.468594789505005 }, { "auxiliary_loss_clip": 0.01130611, "auxiliary_loss_mlp": 0.01056749, "balance_loss_clip": 1.02909219, "balance_loss_mlp": 1.0327003, "epoch": 0.09727942281677439, "flos": 27343137093120.0, "grad_norm": 1.8482913576706792, "language_loss": 0.80511546, "learning_rate": 3.907525401287082e-06, "loss": 0.82698905, "num_input_tokens_seen": 34808330, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.98046875, "step": 1618, "time_per_iteration": 2.519648313522339 }, { "auxiliary_loss_clip": 0.01125056, "auxiliary_loss_mlp": 0.01046007, "balance_loss_clip": 1.02254701, "balance_loss_mlp": 1.03280425, "epoch": 0.09733954606944235, "flos": 24898867741440.0, "grad_norm": 1.656348963972433, "language_loss": 0.93125695, "learning_rate": 3.907411808721961e-06, "loss": 0.95296764, "num_input_tokens_seen": 34830020, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.921875, "step": 1619, "time_per_iteration": 2.4830868244171143 }, { "auxiliary_loss_clip": 0.01126306, "auxiliary_loss_mlp": 0.01051411, "balance_loss_clip": 1.02572119, "balance_loss_mlp": 1.03497815, "epoch": 0.09739966932211033, "flos": 31502584540800.0, "grad_norm": 2.14999931966844, "language_loss": 0.88552165, "learning_rate": 3.907298148086219e-06, "loss": 0.9072988, "num_input_tokens_seen": 34850330, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.91015625, "step": 1620, "time_per_iteration": 2.555567741394043 }, { "auxiliary_loss_clip": 0.01129719, "auxiliary_loss_mlp": 0.01056381, "balance_loss_clip": 1.02976167, "balance_loss_mlp": 1.033777, "epoch": 0.0974597925747783, "flos": 23877623295360.0, "grad_norm": 1.937261380343017, "language_loss": 0.77111161, "learning_rate": 3.907184419383912e-06, "loss": 0.79297262, "num_input_tokens_seen": 34871640, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.9609375, "step": 1621, "time_per_iteration": 2.5151984691619873 }, { "auxiliary_loss_clip": 0.01128858, "auxiliary_loss_mlp": 0.01066381, "balance_loss_clip": 1.04022598, "balance_loss_mlp": 1.0327791, "epoch": 0.09751991582744626, "flos": 17018621568000.0, "grad_norm": 1.9740811110808778, "language_loss": 0.77910846, "learning_rate": 3.907070622619099e-06, "loss": 0.80106086, "num_input_tokens_seen": 34888100, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9609375, "step": 1622, "time_per_iteration": 2.4174482822418213 }, { "auxiliary_loss_clip": 0.01130166, "auxiliary_loss_mlp": 0.01057601, "balance_loss_clip": 1.02941966, "balance_loss_mlp": 1.03194284, "epoch": 0.09758003908011423, "flos": 28401564003840.0, "grad_norm": 4.7039111582580535, "language_loss": 0.85681069, "learning_rate": 3.906956757795841e-06, "loss": 0.87868834, "num_input_tokens_seen": 34910485, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.984375, "step": 1623, "time_per_iteration": 2.5555121898651123 }, { "auxiliary_loss_clip": 0.01127703, "auxiliary_loss_mlp": 0.01053942, "balance_loss_clip": 1.02783489, "balance_loss_mlp": 1.03434443, "epoch": 0.09764016233278221, "flos": 18143488529280.0, "grad_norm": 2.193997572133753, "language_loss": 0.79843217, "learning_rate": 3.906842824918201e-06, "loss": 0.8202486, "num_input_tokens_seen": 34928615, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.93359375, "step": 1624, "time_per_iteration": 2.418097496032715 }, { "auxiliary_loss_clip": 0.01127799, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.02371407, "balance_loss_mlp": 1.03221011, "epoch": 0.09770028558545017, "flos": 15265004198400.0, "grad_norm": 2.217296861129646, "language_loss": 0.8578465, "learning_rate": 3.906728823990246e-06, "loss": 0.87961197, "num_input_tokens_seen": 34946045, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.95703125, "step": 1625, "time_per_iteration": 2.4446890354156494 }, { "auxiliary_loss_clip": 0.01130691, "auxiliary_loss_mlp": 0.01056346, "balance_loss_clip": 1.03190827, "balance_loss_mlp": 1.03437328, "epoch": 0.09776040883811814, "flos": 23471444004480.0, "grad_norm": 2.178532157941631, "language_loss": 0.85360849, "learning_rate": 3.906614755016044e-06, "loss": 0.87547886, "num_input_tokens_seen": 34962865, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.96484375, "step": 1626, "time_per_iteration": 2.485790967941284 }, { "auxiliary_loss_clip": 0.01134409, "auxiliary_loss_mlp": 0.01051877, "balance_loss_clip": 1.02445865, "balance_loss_mlp": 1.03664041, "epoch": 0.09782053209078612, "flos": 24498309179520.0, "grad_norm": 2.56867210436732, "language_loss": 0.83493525, "learning_rate": 3.9065006179996655e-06, "loss": 0.85679817, "num_input_tokens_seen": 34983505, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.9765625, "step": 1627, "time_per_iteration": 2.505154609680176 }, { "auxiliary_loss_clip": 0.01125757, "auxiliary_loss_mlp": 0.01049378, "balance_loss_clip": 1.02329516, "balance_loss_mlp": 1.03237844, "epoch": 0.09788065534345408, "flos": 21579081425280.0, "grad_norm": 2.4486454381811202, "language_loss": 0.8416543, "learning_rate": 3.906386412945184e-06, "loss": 0.8634057, "num_input_tokens_seen": 35001825, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9296875, "step": 1628, "time_per_iteration": 2.4483213424682617 }, { "auxiliary_loss_clip": 0.01126192, "auxiliary_loss_mlp": 0.01044646, "balance_loss_clip": 1.01932597, "balance_loss_mlp": 1.03139532, "epoch": 0.09794077859612205, "flos": 23841313614720.0, "grad_norm": 1.6683207470752828, "language_loss": 0.75619781, "learning_rate": 3.906272139856676e-06, "loss": 0.77790618, "num_input_tokens_seen": 35023075, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9453125, "step": 1629, "time_per_iteration": 2.4707629680633545 }, { "auxiliary_loss_clip": 0.011295, "auxiliary_loss_mlp": 0.01048848, "balance_loss_clip": 1.02209711, "balance_loss_mlp": 1.03475523, "epoch": 0.09800090184879003, "flos": 23658752782080.0, "grad_norm": 1.8837128082629686, "language_loss": 0.78327549, "learning_rate": 3.906157798738218e-06, "loss": 0.80505896, "num_input_tokens_seen": 35043480, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.9453125, "step": 1630, "time_per_iteration": 2.4542155265808105 }, { "auxiliary_loss_clip": 0.01128359, "auxiliary_loss_mlp": 0.01050031, "balance_loss_clip": 1.02196896, "balance_loss_mlp": 1.03394079, "epoch": 0.09806102510145799, "flos": 17054826514560.0, "grad_norm": 2.2129601684385456, "language_loss": 0.86369681, "learning_rate": 3.906043389593892e-06, "loss": 0.8854807, "num_input_tokens_seen": 35061490, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 0.9453125, "step": 1631, "time_per_iteration": 2.3962056636810303 }, { "auxiliary_loss_clip": 0.01126007, "auxiliary_loss_mlp": 0.01048049, "balance_loss_clip": 1.0227648, "balance_loss_mlp": 1.03338003, "epoch": 0.09812114835412596, "flos": 23877344004480.0, "grad_norm": 2.0547621584516267, "language_loss": 0.83182806, "learning_rate": 3.9059289124277804e-06, "loss": 0.85356867, "num_input_tokens_seen": 35079670, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.92578125, "step": 1632, "time_per_iteration": 2.4445884227752686 }, { "auxiliary_loss_clip": 0.01129925, "auxiliary_loss_mlp": 0.01054484, "balance_loss_clip": 1.02919996, "balance_loss_mlp": 1.03536582, "epoch": 0.09818127160679392, "flos": 20594425950720.0, "grad_norm": 1.9895039626173088, "language_loss": 0.78635532, "learning_rate": 3.9058143672439684e-06, "loss": 0.80819941, "num_input_tokens_seen": 35099205, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9453125, "step": 1633, "time_per_iteration": 2.442203998565674 }, { "auxiliary_loss_clip": 0.01124605, "auxiliary_loss_mlp": 0.01049381, "balance_loss_clip": 1.02381027, "balance_loss_mlp": 1.03329194, "epoch": 0.0982413948594619, "flos": 15486423240960.0, "grad_norm": 2.3402569957392636, "language_loss": 0.73614502, "learning_rate": 3.905699754046544e-06, "loss": 0.75788486, "num_input_tokens_seen": 35115270, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.9140625, "step": 1634, "time_per_iteration": 3.9503259658813477 }, { "auxiliary_loss_clip": 0.01130281, "auxiliary_loss_mlp": 0.01056121, "balance_loss_clip": 1.02904904, "balance_loss_mlp": 1.03145909, "epoch": 0.09830151811212987, "flos": 24206784393600.0, "grad_norm": 2.5834969716202623, "language_loss": 0.72645545, "learning_rate": 3.905585072839597e-06, "loss": 0.74831951, "num_input_tokens_seen": 35134065, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.98828125, "step": 1635, "time_per_iteration": 2.446293592453003 }, { "auxiliary_loss_clip": 0.01132306, "auxiliary_loss_mlp": 0.01046668, "balance_loss_clip": 1.0192616, "balance_loss_mlp": 1.03499234, "epoch": 0.09836164136479783, "flos": 20593553166720.0, "grad_norm": 2.563211383861435, "language_loss": 0.78225213, "learning_rate": 3.905470323627221e-06, "loss": 0.80404186, "num_input_tokens_seen": 35154870, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.97265625, "step": 1636, "time_per_iteration": 3.8364906311035156 }, { "auxiliary_loss_clip": 0.01128111, "auxiliary_loss_mlp": 0.01056342, "balance_loss_clip": 1.03126025, "balance_loss_mlp": 1.03416944, "epoch": 0.09842176461746581, "flos": 19933241022720.0, "grad_norm": 1.9148254897281238, "language_loss": 0.69535017, "learning_rate": 3.9053555064135106e-06, "loss": 0.71719474, "num_input_tokens_seen": 35171850, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.9375, "step": 1637, "time_per_iteration": 2.4151923656463623 }, { "auxiliary_loss_clip": 0.01127765, "auxiliary_loss_mlp": 0.01053649, "balance_loss_clip": 1.02756572, "balance_loss_mlp": 1.03222847, "epoch": 0.09848188787013377, "flos": 21213610646400.0, "grad_norm": 2.1697528708543414, "language_loss": 0.7735889, "learning_rate": 3.905240621202563e-06, "loss": 0.79540306, "num_input_tokens_seen": 35188795, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.953125, "step": 1638, "time_per_iteration": 3.8829591274261475 }, { "auxiliary_loss_clip": 0.01125303, "auxiliary_loss_mlp": 0.01041223, "balance_loss_clip": 1.01531875, "balance_loss_mlp": 1.03256583, "epoch": 0.09854201112280174, "flos": 30152912135040.0, "grad_norm": 1.5071899996243445, "language_loss": 0.72347581, "learning_rate": 3.905125667998478e-06, "loss": 0.74514115, "num_input_tokens_seen": 35212100, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.92578125, "step": 1639, "time_per_iteration": 2.500878095626831 }, { "auxiliary_loss_clip": 0.01127181, "auxiliary_loss_mlp": 0.01039899, "balance_loss_clip": 1.01400685, "balance_loss_mlp": 1.03245223, "epoch": 0.09860213437546972, "flos": 21794740093440.0, "grad_norm": 1.704259235748373, "language_loss": 0.88319802, "learning_rate": 3.90501064680536e-06, "loss": 0.90486884, "num_input_tokens_seen": 35230390, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.9453125, "step": 1640, "time_per_iteration": 2.4600088596343994 }, { "auxiliary_loss_clip": 0.0113121, "auxiliary_loss_mlp": 0.01042793, "balance_loss_clip": 1.01701999, "balance_loss_mlp": 1.03482461, "epoch": 0.09866225762813768, "flos": 21834471087360.0, "grad_norm": 2.5186735761485917, "language_loss": 0.80387259, "learning_rate": 3.904895557627311e-06, "loss": 0.82561255, "num_input_tokens_seen": 35250405, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.96484375, "step": 1641, "time_per_iteration": 2.4450650215148926 }, { "auxiliary_loss_clip": 0.01128525, "auxiliary_loss_mlp": 0.01048598, "balance_loss_clip": 1.02246714, "balance_loss_mlp": 1.03371489, "epoch": 0.09872238088080565, "flos": 17598982965120.0, "grad_norm": 2.5171415456479145, "language_loss": 0.86056006, "learning_rate": 3.90478040046844e-06, "loss": 0.88233137, "num_input_tokens_seen": 35262820, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9453125, "step": 1642, "time_per_iteration": 2.3926727771759033 }, { "auxiliary_loss_clip": 0.01130981, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.02081847, "balance_loss_mlp": 1.03485167, "epoch": 0.09878250413347361, "flos": 27634906258560.0, "grad_norm": 1.5784985955077508, "language_loss": 0.80769372, "learning_rate": 3.9046651753328565e-06, "loss": 0.82946241, "num_input_tokens_seen": 35284490, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.9609375, "step": 1643, "time_per_iteration": 2.501354694366455 }, { "auxiliary_loss_clip": 0.01126589, "auxiliary_loss_mlp": 0.01054205, "balance_loss_clip": 1.0288372, "balance_loss_mlp": 1.03275657, "epoch": 0.0988426273861416, "flos": 16543802810880.0, "grad_norm": 1.9392785792078961, "language_loss": 0.82399493, "learning_rate": 3.904549882224672e-06, "loss": 0.8458029, "num_input_tokens_seen": 35302815, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9375, "step": 1644, "time_per_iteration": 2.427469491958618 }, { "auxiliary_loss_clip": 0.01124519, "auxiliary_loss_mlp": 0.01045704, "balance_loss_clip": 1.02088428, "balance_loss_mlp": 1.03299594, "epoch": 0.09890275063880956, "flos": 21214204139520.0, "grad_norm": 1.8836345415938323, "language_loss": 0.68441319, "learning_rate": 3.904434521148001e-06, "loss": 0.70611537, "num_input_tokens_seen": 35321175, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.91796875, "step": 1645, "time_per_iteration": 2.4562041759490967 }, { "auxiliary_loss_clip": 0.01037503, "auxiliary_loss_mlp": 0.01022829, "balance_loss_clip": 1.01913321, "balance_loss_mlp": 1.01097751, "epoch": 0.09896287389147752, "flos": 59376225876480.0, "grad_norm": 0.857572721094008, "language_loss": 0.60793686, "learning_rate": 3.904319092106961e-06, "loss": 0.62854016, "num_input_tokens_seen": 35381740, "router_z_loss_clip": 0.03686523, "router_z_loss_mlp": 0.265625, "step": 1646, "time_per_iteration": 3.085836172103882 }, { "auxiliary_loss_clip": 0.01124927, "auxiliary_loss_mlp": 0.01050674, "balance_loss_clip": 1.02424479, "balance_loss_mlp": 1.03245282, "epoch": 0.0990229971441455, "flos": 29641399672320.0, "grad_norm": 2.1236808177994075, "language_loss": 0.73563886, "learning_rate": 3.904203595105671e-06, "loss": 0.75739485, "num_input_tokens_seen": 35403760, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.92578125, "step": 1647, "time_per_iteration": 2.485992670059204 }, { "auxiliary_loss_clip": 0.01126761, "auxiliary_loss_mlp": 0.01049389, "balance_loss_clip": 1.02456927, "balance_loss_mlp": 1.03380871, "epoch": 0.09908312039681347, "flos": 21833807771520.0, "grad_norm": 2.0031847019009117, "language_loss": 0.84025264, "learning_rate": 3.904088030148253e-06, "loss": 0.86201417, "num_input_tokens_seen": 35424050, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.9296875, "step": 1648, "time_per_iteration": 2.454493522644043 }, { "auxiliary_loss_clip": 0.01034882, "auxiliary_loss_mlp": 0.01007785, "balance_loss_clip": 1.00447071, "balance_loss_mlp": 1.00925016, "epoch": 0.09914324364948143, "flos": 57560951312640.0, "grad_norm": 0.7264387503758062, "language_loss": 0.5566957, "learning_rate": 3.90397239723883e-06, "loss": 0.57712233, "num_input_tokens_seen": 35481690, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.2578125, "step": 1649, "time_per_iteration": 3.019339084625244 }, { "auxiliary_loss_clip": 0.01120967, "auxiliary_loss_mlp": 0.01050197, "balance_loss_clip": 1.0244596, "balance_loss_mlp": 1.03003716, "epoch": 0.09920336690214941, "flos": 34122711744000.0, "grad_norm": 2.1078890210501404, "language_loss": 0.89719647, "learning_rate": 3.903856696381531e-06, "loss": 0.91890812, "num_input_tokens_seen": 35498635, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.91015625, "step": 1650, "time_per_iteration": 2.5014076232910156 }, { "auxiliary_loss_clip": 0.01032729, "auxiliary_loss_mlp": 0.01006617, "balance_loss_clip": 1.0029211, "balance_loss_mlp": 1.00755525, "epoch": 0.09926349015481738, "flos": 71212514716800.0, "grad_norm": 0.7942200012221744, "language_loss": 0.63744354, "learning_rate": 3.903740927580484e-06, "loss": 0.65783697, "num_input_tokens_seen": 35565720, "router_z_loss_clip": 0.03686523, "router_z_loss_mlp": 0.25195312, "step": 1651, "time_per_iteration": 3.259958505630493 }, { "auxiliary_loss_clip": 0.01123818, "auxiliary_loss_mlp": 0.01055441, "balance_loss_clip": 1.03074133, "balance_loss_mlp": 1.03246665, "epoch": 0.09932361340748534, "flos": 23147589432960.0, "grad_norm": 2.3108050741700272, "language_loss": 0.8803277, "learning_rate": 3.90362509083982e-06, "loss": 0.90212035, "num_input_tokens_seen": 35586000, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.9140625, "step": 1652, "time_per_iteration": 2.527146816253662 }, { "auxiliary_loss_clip": 0.01128132, "auxiliary_loss_mlp": 0.01048056, "balance_loss_clip": 1.02420235, "balance_loss_mlp": 1.03502429, "epoch": 0.09938373666015332, "flos": 19827628560000.0, "grad_norm": 2.0249951043498418, "language_loss": 0.82159197, "learning_rate": 3.903509186163673e-06, "loss": 0.84335381, "num_input_tokens_seen": 35604355, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.9296875, "step": 1653, "time_per_iteration": 2.4585115909576416 }, { "auxiliary_loss_clip": 0.01127948, "auxiliary_loss_mlp": 0.01055208, "balance_loss_clip": 1.02905321, "balance_loss_mlp": 1.0347662, "epoch": 0.09944385991282129, "flos": 22089581458560.0, "grad_norm": 2.025344607574988, "language_loss": 0.79414368, "learning_rate": 3.903393213556179e-06, "loss": 0.81597531, "num_input_tokens_seen": 35625495, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9296875, "step": 1654, "time_per_iteration": 2.5175716876983643 }, { "auxiliary_loss_clip": 0.01127728, "auxiliary_loss_mlp": 0.01055725, "balance_loss_clip": 1.03126287, "balance_loss_mlp": 1.03671968, "epoch": 0.09950398316548925, "flos": 19827838028160.0, "grad_norm": 1.631934643293413, "language_loss": 0.81203735, "learning_rate": 3.903277173021479e-06, "loss": 0.83387184, "num_input_tokens_seen": 35645030, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.91015625, "step": 1655, "time_per_iteration": 2.5070693492889404 }, { "auxiliary_loss_clip": 0.01122712, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.02300107, "balance_loss_mlp": 1.03238094, "epoch": 0.09956410641815722, "flos": 25002699724800.0, "grad_norm": 1.8733174755885336, "language_loss": 0.80317938, "learning_rate": 3.903161064563712e-06, "loss": 0.8248744, "num_input_tokens_seen": 35664305, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.90234375, "step": 1656, "time_per_iteration": 2.480952739715576 }, { "auxiliary_loss_clip": 0.01129665, "auxiliary_loss_mlp": 0.01052113, "balance_loss_clip": 1.02768731, "balance_loss_mlp": 1.03797841, "epoch": 0.0996242296708252, "flos": 19316709590400.0, "grad_norm": 1.677361575413214, "language_loss": 0.88713956, "learning_rate": 3.9030448881870206e-06, "loss": 0.90895736, "num_input_tokens_seen": 35684060, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.91796875, "step": 1657, "time_per_iteration": 2.445462465286255 }, { "auxiliary_loss_clip": 0.01131792, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.01911855, "balance_loss_mlp": 1.03498387, "epoch": 0.09968435292349316, "flos": 21870536388480.0, "grad_norm": 2.4750170046506597, "language_loss": 0.84711289, "learning_rate": 3.902928643895554e-06, "loss": 0.86888373, "num_input_tokens_seen": 35703250, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.96875, "step": 1658, "time_per_iteration": 2.4258053302764893 }, { "auxiliary_loss_clip": 0.01031565, "auxiliary_loss_mlp": 0.01005096, "balance_loss_clip": 1.00178158, "balance_loss_mlp": 1.00624537, "epoch": 0.09974447617616113, "flos": 65381636839680.0, "grad_norm": 0.9018418713282724, "language_loss": 0.60856706, "learning_rate": 3.9028123316934575e-06, "loss": 0.62893367, "num_input_tokens_seen": 35762165, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.25390625, "step": 1659, "time_per_iteration": 3.0654025077819824 }, { "auxiliary_loss_clip": 0.01127454, "auxiliary_loss_mlp": 0.01047572, "balance_loss_clip": 1.02435017, "balance_loss_mlp": 1.03469169, "epoch": 0.0998045994288291, "flos": 23658682959360.0, "grad_norm": 4.848363324112766, "language_loss": 0.85086519, "learning_rate": 3.902695951584885e-06, "loss": 0.87261546, "num_input_tokens_seen": 35781520, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.92578125, "step": 1660, "time_per_iteration": 2.453413963317871 }, { "auxiliary_loss_clip": 0.0112804, "auxiliary_loss_mlp": 0.01046614, "balance_loss_clip": 1.02045989, "balance_loss_mlp": 1.03721237, "epoch": 0.09986472268149707, "flos": 19608688224000.0, "grad_norm": 1.9418971738798911, "language_loss": 0.80042166, "learning_rate": 3.902579503573987e-06, "loss": 0.82216817, "num_input_tokens_seen": 35799565, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.90625, "step": 1661, "time_per_iteration": 2.4215495586395264 }, { "auxiliary_loss_clip": 0.01128891, "auxiliary_loss_mlp": 0.01046257, "balance_loss_clip": 1.01982832, "balance_loss_mlp": 1.03200734, "epoch": 0.09992484593416504, "flos": 26212125732480.0, "grad_norm": 1.8201023566804326, "language_loss": 0.83474773, "learning_rate": 3.902462987664922e-06, "loss": 0.85649925, "num_input_tokens_seen": 35821085, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.96875, "step": 1662, "time_per_iteration": 2.4538283348083496 }, { "auxiliary_loss_clip": 0.01127811, "auxiliary_loss_mlp": 0.01049377, "balance_loss_clip": 1.02310371, "balance_loss_mlp": 1.03647661, "epoch": 0.09998496918683301, "flos": 17492672275200.0, "grad_norm": 2.0821206111460366, "language_loss": 0.88856053, "learning_rate": 3.902346403861846e-06, "loss": 0.91033244, "num_input_tokens_seen": 35839840, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9140625, "step": 1663, "time_per_iteration": 2.412142753601074 }, { "auxiliary_loss_clip": 0.01128487, "auxiliary_loss_mlp": 0.01050009, "balance_loss_clip": 1.02400947, "balance_loss_mlp": 1.03475738, "epoch": 0.10004509243950098, "flos": 22783794399360.0, "grad_norm": 1.6838586642707083, "language_loss": 0.70417583, "learning_rate": 3.9022297521689196e-06, "loss": 0.72596073, "num_input_tokens_seen": 35861545, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9375, "step": 1664, "time_per_iteration": 2.4572854042053223 }, { "auxiliary_loss_clip": 0.0112979, "auxiliary_loss_mlp": 0.01051836, "balance_loss_clip": 1.02657557, "balance_loss_mlp": 1.03681624, "epoch": 0.10010521569216894, "flos": 16252452581760.0, "grad_norm": 2.3885815761458833, "language_loss": 0.78945351, "learning_rate": 3.902113032590307e-06, "loss": 0.8112697, "num_input_tokens_seen": 35878295, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.9296875, "step": 1665, "time_per_iteration": 2.4018146991729736 }, { "auxiliary_loss_clip": 0.01133934, "auxiliary_loss_mlp": 0.0106273, "balance_loss_clip": 1.03639627, "balance_loss_mlp": 1.03929853, "epoch": 0.10016533894483691, "flos": 23401512817920.0, "grad_norm": 1.7453432919123004, "language_loss": 0.70129985, "learning_rate": 3.901996245130174e-06, "loss": 0.72326648, "num_input_tokens_seen": 35898990, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9453125, "step": 1666, "time_per_iteration": 2.441276788711548 }, { "auxiliary_loss_clip": 0.01128306, "auxiliary_loss_mlp": 0.0106182, "balance_loss_clip": 1.03455698, "balance_loss_mlp": 1.03533638, "epoch": 0.10022546219750489, "flos": 19353158916480.0, "grad_norm": 2.1816995475096856, "language_loss": 0.78218007, "learning_rate": 3.901879389792686e-06, "loss": 0.80408126, "num_input_tokens_seen": 35916225, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.9296875, "step": 1667, "time_per_iteration": 2.3924455642700195 }, { "auxiliary_loss_clip": 0.01128903, "auxiliary_loss_mlp": 0.01055359, "balance_loss_clip": 1.02883434, "balance_loss_mlp": 1.03445554, "epoch": 0.10028558545017285, "flos": 27084640320000.0, "grad_norm": 2.5698885473644046, "language_loss": 0.77251256, "learning_rate": 3.9017624665820155e-06, "loss": 0.79435515, "num_input_tokens_seen": 35934630, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.9453125, "step": 1668, "time_per_iteration": 2.4461889266967773 }, { "auxiliary_loss_clip": 0.01126996, "auxiliary_loss_mlp": 0.01049487, "balance_loss_clip": 1.02247405, "balance_loss_mlp": 1.03288484, "epoch": 0.10034570870284082, "flos": 25845991637760.0, "grad_norm": 2.261072014975546, "language_loss": 0.78254324, "learning_rate": 3.901645475502334e-06, "loss": 0.80430806, "num_input_tokens_seen": 35953855, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.94140625, "step": 1669, "time_per_iteration": 2.4449715614318848 }, { "auxiliary_loss_clip": 0.01131728, "auxiliary_loss_mlp": 0.01064008, "balance_loss_clip": 1.03719783, "balance_loss_mlp": 1.0359478, "epoch": 0.1004058319555088, "flos": 26248400501760.0, "grad_norm": 3.2733356873499346, "language_loss": 0.85289216, "learning_rate": 3.901528416557817e-06, "loss": 0.87484956, "num_input_tokens_seen": 35974555, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.9609375, "step": 1670, "time_per_iteration": 2.451265335083008 }, { "auxiliary_loss_clip": 0.01121068, "auxiliary_loss_mlp": 0.01044712, "balance_loss_clip": 1.02132273, "balance_loss_mlp": 1.03208125, "epoch": 0.10046595520817676, "flos": 25373302473600.0, "grad_norm": 1.5981438708977425, "language_loss": 0.77034068, "learning_rate": 3.901411289752643e-06, "loss": 0.79199851, "num_input_tokens_seen": 35996830, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.890625, "step": 1671, "time_per_iteration": 2.4486119747161865 }, { "auxiliary_loss_clip": 0.01029094, "auxiliary_loss_mlp": 0.01005826, "balance_loss_clip": 1.00208318, "balance_loss_mlp": 1.00429285, "epoch": 0.10052607846084473, "flos": 67458934224000.0, "grad_norm": 0.7714351547673884, "language_loss": 0.60759377, "learning_rate": 3.901294095090991e-06, "loss": 0.62794292, "num_input_tokens_seen": 36054465, "router_z_loss_clip": 0.03735352, "router_z_loss_mlp": 0.24804688, "step": 1672, "time_per_iteration": 3.0407564640045166 }, { "auxiliary_loss_clip": 0.01131427, "auxiliary_loss_mlp": 0.01055055, "balance_loss_clip": 1.02826881, "balance_loss_mlp": 1.03504014, "epoch": 0.10058620171351271, "flos": 21359442862080.0, "grad_norm": 2.0496448377842214, "language_loss": 0.76836884, "learning_rate": 3.901176832577043e-06, "loss": 0.79023367, "num_input_tokens_seen": 36073480, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.96484375, "step": 1673, "time_per_iteration": 3.8078036308288574 }, { "auxiliary_loss_clip": 0.01125647, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.01638508, "balance_loss_mlp": 1.03168154, "epoch": 0.10064632496618067, "flos": 16799192472960.0, "grad_norm": 2.1322106564210506, "language_loss": 0.73229301, "learning_rate": 3.901059502214984e-06, "loss": 0.75396329, "num_input_tokens_seen": 36091830, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.94140625, "step": 1674, "time_per_iteration": 2.386910915374756 }, { "auxiliary_loss_clip": 0.0112982, "auxiliary_loss_mlp": 0.01044697, "balance_loss_clip": 1.0194962, "balance_loss_mlp": 1.03423774, "epoch": 0.10070644821884864, "flos": 23623280974080.0, "grad_norm": 2.1319658939344626, "language_loss": 0.79347897, "learning_rate": 3.900942104009003e-06, "loss": 0.81522405, "num_input_tokens_seen": 36111400, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.953125, "step": 1675, "time_per_iteration": 3.822953939437866 }, { "auxiliary_loss_clip": 0.01125259, "auxiliary_loss_mlp": 0.01056213, "balance_loss_clip": 1.03017747, "balance_loss_mlp": 1.03400826, "epoch": 0.1007665714715166, "flos": 24461406005760.0, "grad_norm": 2.608486034898942, "language_loss": 0.81366646, "learning_rate": 3.900824637963287e-06, "loss": 0.83548117, "num_input_tokens_seen": 36129345, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.9140625, "step": 1676, "time_per_iteration": 2.427243232727051 }, { "auxiliary_loss_clip": 0.01130224, "auxiliary_loss_mlp": 0.01053137, "balance_loss_clip": 1.02775776, "balance_loss_mlp": 1.03403139, "epoch": 0.10082669472418458, "flos": 16798214954880.0, "grad_norm": 2.002097366993846, "language_loss": 0.8618263, "learning_rate": 3.9007071040820285e-06, "loss": 0.88365984, "num_input_tokens_seen": 36146255, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.96484375, "step": 1677, "time_per_iteration": 3.9178919792175293 }, { "auxiliary_loss_clip": 0.01127388, "auxiliary_loss_mlp": 0.01050704, "balance_loss_clip": 1.02513361, "balance_loss_mlp": 1.03335094, "epoch": 0.10088681797685255, "flos": 13552653922560.0, "grad_norm": 2.024853065057127, "language_loss": 0.85943526, "learning_rate": 3.900589502369423e-06, "loss": 0.88121617, "num_input_tokens_seen": 36164050, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.94140625, "step": 1678, "time_per_iteration": 2.389723062515259 }, { "auxiliary_loss_clip": 0.0113194, "auxiliary_loss_mlp": 0.0104625, "balance_loss_clip": 1.01992822, "balance_loss_mlp": 1.03805208, "epoch": 0.10094694122952051, "flos": 25264513077120.0, "grad_norm": 2.1347749920294357, "language_loss": 0.89958286, "learning_rate": 3.9004718328296676e-06, "loss": 0.92136478, "num_input_tokens_seen": 36183530, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.94140625, "step": 1679, "time_per_iteration": 2.444307565689087 }, { "auxiliary_loss_clip": 0.01029359, "auxiliary_loss_mlp": 0.01012468, "balance_loss_clip": 1.00867677, "balance_loss_mlp": 1.0046916, "epoch": 0.10100706448218849, "flos": 69850762980480.0, "grad_norm": 0.7774210750054823, "language_loss": 0.52998149, "learning_rate": 3.900354095466962e-06, "loss": 0.55039978, "num_input_tokens_seen": 36248550, "router_z_loss_clip": 0.0378418, "router_z_loss_mlp": 0.24707031, "step": 1680, "time_per_iteration": 3.087531328201294 }, { "auxiliary_loss_clip": 0.01125771, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.01921439, "balance_loss_mlp": 1.03489673, "epoch": 0.10106718773485646, "flos": 20006244408960.0, "grad_norm": 1.7655943226966269, "language_loss": 0.76840341, "learning_rate": 3.900236290285506e-06, "loss": 0.79010677, "num_input_tokens_seen": 36266065, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.91015625, "step": 1681, "time_per_iteration": 2.3962461948394775 }, { "auxiliary_loss_clip": 0.01132826, "auxiliary_loss_mlp": 0.01058513, "balance_loss_clip": 1.02968824, "balance_loss_mlp": 1.03300309, "epoch": 0.10112731098752442, "flos": 13478987220480.0, "grad_norm": 2.3429988295473376, "language_loss": 0.93676221, "learning_rate": 3.900118417289504e-06, "loss": 0.95867562, "num_input_tokens_seen": 36280960, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.0, "step": 1682, "time_per_iteration": 2.3901689052581787 }, { "auxiliary_loss_clip": 0.01129013, "auxiliary_loss_mlp": 0.01044731, "balance_loss_clip": 1.01974475, "balance_loss_mlp": 1.03455329, "epoch": 0.1011874342401924, "flos": 18514894239360.0, "grad_norm": 2.5183537344737164, "language_loss": 0.87868714, "learning_rate": 3.900000476483164e-06, "loss": 0.9004246, "num_input_tokens_seen": 36299010, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.9453125, "step": 1683, "time_per_iteration": 2.381648302078247 }, { "auxiliary_loss_clip": 0.01128587, "auxiliary_loss_mlp": 0.01050638, "balance_loss_clip": 1.02581882, "balance_loss_mlp": 1.03547812, "epoch": 0.10124755749286037, "flos": 20701853804160.0, "grad_norm": 1.7995433586265865, "language_loss": 0.7452631, "learning_rate": 3.8998824678706946e-06, "loss": 0.76705539, "num_input_tokens_seen": 36318400, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.93359375, "step": 1684, "time_per_iteration": 2.424262523651123 }, { "auxiliary_loss_clip": 0.01026459, "auxiliary_loss_mlp": 0.01004289, "balance_loss_clip": 1.00049782, "balance_loss_mlp": 1.00221205, "epoch": 0.10130768074552833, "flos": 56106015557760.0, "grad_norm": 0.7860492151257247, "language_loss": 0.61080587, "learning_rate": 3.899764391456306e-06, "loss": 0.63111335, "num_input_tokens_seen": 36381815, "router_z_loss_clip": 0.0378418, "router_z_loss_mlp": 0.2421875, "step": 1685, "time_per_iteration": 3.1197712421417236 }, { "auxiliary_loss_clip": 0.01127716, "auxiliary_loss_mlp": 0.01050595, "balance_loss_clip": 1.02488136, "balance_loss_mlp": 1.03425086, "epoch": 0.1013678039981963, "flos": 33400916962560.0, "grad_norm": 2.7186454758616514, "language_loss": 0.61819071, "learning_rate": 3.8996462472442145e-06, "loss": 0.63997382, "num_input_tokens_seen": 36404320, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.9375, "step": 1686, "time_per_iteration": 2.557370185852051 }, { "auxiliary_loss_clip": 0.01129063, "auxiliary_loss_mlp": 0.0104935, "balance_loss_clip": 1.02246845, "balance_loss_mlp": 1.03644657, "epoch": 0.10142792725086427, "flos": 31903980975360.0, "grad_norm": 2.880887399024693, "language_loss": 0.81339729, "learning_rate": 3.8995280352386344e-06, "loss": 0.83518136, "num_input_tokens_seen": 36427510, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.92578125, "step": 1687, "time_per_iteration": 2.5703492164611816 }, { "auxiliary_loss_clip": 0.01131885, "auxiliary_loss_mlp": 0.01052296, "balance_loss_clip": 1.02561688, "balance_loss_mlp": 1.03477442, "epoch": 0.10148805050353224, "flos": 28474637212800.0, "grad_norm": 1.9758616894600414, "language_loss": 0.71980017, "learning_rate": 3.899409755443785e-06, "loss": 0.741642, "num_input_tokens_seen": 36448230, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.97265625, "step": 1688, "time_per_iteration": 2.483548164367676 }, { "auxiliary_loss_clip": 0.01126728, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.02780175, "balance_loss_mlp": 1.03525734, "epoch": 0.1015481737562002, "flos": 25147903536000.0, "grad_norm": 2.4480015311261627, "language_loss": 0.86638576, "learning_rate": 3.899291407863887e-06, "loss": 0.88818043, "num_input_tokens_seen": 36464395, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.9140625, "step": 1689, "time_per_iteration": 2.476289749145508 }, { "auxiliary_loss_clip": 0.01123046, "auxiliary_loss_mlp": 0.01043488, "balance_loss_clip": 1.01890743, "balance_loss_mlp": 1.03043246, "epoch": 0.10160829700886818, "flos": 30881479720320.0, "grad_norm": 1.7647076627727838, "language_loss": 0.88198733, "learning_rate": 3.899172992503165e-06, "loss": 0.90365267, "num_input_tokens_seen": 36486475, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.92578125, "step": 1690, "time_per_iteration": 2.602295160293579 }, { "auxiliary_loss_clip": 0.01126595, "auxiliary_loss_mlp": 0.01044899, "balance_loss_clip": 1.02053297, "balance_loss_mlp": 1.03325152, "epoch": 0.10166842026153615, "flos": 20410992334080.0, "grad_norm": 2.4958265577871694, "language_loss": 0.83553779, "learning_rate": 3.899054509365843e-06, "loss": 0.85725272, "num_input_tokens_seen": 36505310, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.93359375, "step": 1691, "time_per_iteration": 2.4795753955841064 }, { "auxiliary_loss_clip": 0.01127851, "auxiliary_loss_mlp": 0.01048939, "balance_loss_clip": 1.02351224, "balance_loss_mlp": 1.03416157, "epoch": 0.10172854351420411, "flos": 33475491360000.0, "grad_norm": 1.5062507315521056, "language_loss": 0.6655491, "learning_rate": 3.89893595845615e-06, "loss": 0.68731701, "num_input_tokens_seen": 36529820, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9375, "step": 1692, "time_per_iteration": 2.5656216144561768 }, { "auxiliary_loss_clip": 0.01126495, "auxiliary_loss_mlp": 0.01053501, "balance_loss_clip": 1.02723956, "balance_loss_mlp": 1.03327119, "epoch": 0.1017886667668721, "flos": 23549195335680.0, "grad_norm": 1.6614169439877764, "language_loss": 0.75763559, "learning_rate": 3.898817339778319e-06, "loss": 0.77943558, "num_input_tokens_seen": 36549000, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9296875, "step": 1693, "time_per_iteration": 2.5104470252990723 }, { "auxiliary_loss_clip": 0.01126969, "auxiliary_loss_mlp": 0.01048846, "balance_loss_clip": 1.02250028, "balance_loss_mlp": 1.03276646, "epoch": 0.10184879001954006, "flos": 23294922837120.0, "grad_norm": 1.6836904940668604, "language_loss": 0.8728255, "learning_rate": 3.898698653336581e-06, "loss": 0.89458358, "num_input_tokens_seen": 36567515, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9453125, "step": 1694, "time_per_iteration": 2.4375650882720947 }, { "auxiliary_loss_clip": 0.01130871, "auxiliary_loss_mlp": 0.01048342, "balance_loss_clip": 1.02084053, "balance_loss_mlp": 1.0317018, "epoch": 0.10190891327220802, "flos": 18332123938560.0, "grad_norm": 2.3117364563831915, "language_loss": 0.7957328, "learning_rate": 3.8985798991351715e-06, "loss": 0.81752491, "num_input_tokens_seen": 36586190, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.9921875, "step": 1695, "time_per_iteration": 2.3535845279693604 }, { "auxiliary_loss_clip": 0.0112953, "auxiliary_loss_mlp": 0.01053714, "balance_loss_clip": 1.02674866, "balance_loss_mlp": 1.03284919, "epoch": 0.10196903652487599, "flos": 26464268638080.0, "grad_norm": 1.771862970932022, "language_loss": 0.86338663, "learning_rate": 3.898461077178329e-06, "loss": 0.88521904, "num_input_tokens_seen": 36607495, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.96875, "step": 1696, "time_per_iteration": 2.464308261871338 }, { "auxiliary_loss_clip": 0.01124274, "auxiliary_loss_mlp": 0.0105109, "balance_loss_clip": 1.02643776, "balance_loss_mlp": 1.0333662, "epoch": 0.10202915977754397, "flos": 21868511529600.0, "grad_norm": 1.8515751852928584, "language_loss": 0.82061327, "learning_rate": 3.898342187470296e-06, "loss": 0.84236693, "num_input_tokens_seen": 36628555, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.91015625, "step": 1697, "time_per_iteration": 2.4317948818206787 }, { "auxiliary_loss_clip": 0.01128753, "auxiliary_loss_mlp": 0.01047324, "balance_loss_clip": 1.02047801, "balance_loss_mlp": 1.03356361, "epoch": 0.10208928303021193, "flos": 22308661440000.0, "grad_norm": 2.003597479785428, "language_loss": 0.80216718, "learning_rate": 3.898223230015311e-06, "loss": 0.82392788, "num_input_tokens_seen": 36646250, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.94921875, "step": 1698, "time_per_iteration": 2.4211795330047607 }, { "auxiliary_loss_clip": 0.01126562, "auxiliary_loss_mlp": 0.01038661, "balance_loss_clip": 1.01368654, "balance_loss_mlp": 1.03277946, "epoch": 0.1021494062828799, "flos": 22124529596160.0, "grad_norm": 3.2384389278339683, "language_loss": 0.75553715, "learning_rate": 3.8981042048176235e-06, "loss": 0.77718937, "num_input_tokens_seen": 36666675, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.9375, "step": 1699, "time_per_iteration": 2.40696120262146 }, { "auxiliary_loss_clip": 0.01126844, "auxiliary_loss_mlp": 0.01043679, "balance_loss_clip": 1.01847851, "balance_loss_mlp": 1.03395414, "epoch": 0.10220952953554788, "flos": 19645696131840.0, "grad_norm": 1.660742949698858, "language_loss": 0.79711759, "learning_rate": 3.897985111881478e-06, "loss": 0.81882286, "num_input_tokens_seen": 36685225, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.9296875, "step": 1700, "time_per_iteration": 2.3866331577301025 }, { "auxiliary_loss_clip": 0.01125573, "auxiliary_loss_mlp": 0.01046093, "balance_loss_clip": 1.02139282, "balance_loss_mlp": 1.03189898, "epoch": 0.10226965278821584, "flos": 29786044901760.0, "grad_norm": 1.7820552114416843, "language_loss": 0.77036595, "learning_rate": 3.897865951211127e-06, "loss": 0.79208261, "num_input_tokens_seen": 36705985, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.9375, "step": 1701, "time_per_iteration": 2.4395692348480225 }, { "auxiliary_loss_clip": 0.01129793, "auxiliary_loss_mlp": 0.01043111, "balance_loss_clip": 1.01714671, "balance_loss_mlp": 1.03423548, "epoch": 0.10232977604088381, "flos": 27015581917440.0, "grad_norm": 2.8523336534744077, "language_loss": 0.78233707, "learning_rate": 3.897746722810822e-06, "loss": 0.80406612, "num_input_tokens_seen": 36725815, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.953125, "step": 1702, "time_per_iteration": 2.436391592025757 }, { "auxiliary_loss_clip": 0.01124749, "auxiliary_loss_mlp": 0.01042127, "balance_loss_clip": 1.01767766, "balance_loss_mlp": 1.03302264, "epoch": 0.10238989929355179, "flos": 20776463112960.0, "grad_norm": 2.0057943972936663, "language_loss": 0.94855535, "learning_rate": 3.897627426684818e-06, "loss": 0.97022408, "num_input_tokens_seen": 36742345, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.91796875, "step": 1703, "time_per_iteration": 2.3969039916992188 }, { "auxiliary_loss_clip": 0.01033287, "auxiliary_loss_mlp": 0.01006063, "balance_loss_clip": 1.00258231, "balance_loss_mlp": 1.00845075, "epoch": 0.10245002254621975, "flos": 57695297690880.0, "grad_norm": 0.8679415430569597, "language_loss": 0.55032927, "learning_rate": 3.897508062837372e-06, "loss": 0.57072276, "num_input_tokens_seen": 36798775, "router_z_loss_clip": 0.03491211, "router_z_loss_mlp": 0.24804688, "step": 1704, "time_per_iteration": 2.9495198726654053 }, { "auxiliary_loss_clip": 0.01126333, "auxiliary_loss_mlp": 0.01041418, "balance_loss_clip": 1.01575208, "balance_loss_mlp": 1.03269148, "epoch": 0.10251014579888772, "flos": 16799192472960.0, "grad_norm": 2.1628112329074147, "language_loss": 0.83624583, "learning_rate": 3.897388631272745e-06, "loss": 0.85792339, "num_input_tokens_seen": 36816295, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.9375, "step": 1705, "time_per_iteration": 2.3643715381622314 }, { "auxiliary_loss_clip": 0.01028383, "auxiliary_loss_mlp": 0.01004697, "balance_loss_clip": 1.0013113, "balance_loss_mlp": 1.00360084, "epoch": 0.1025702690515557, "flos": 68562328832640.0, "grad_norm": 0.7604867521608939, "language_loss": 0.60402644, "learning_rate": 3.8972691319951975e-06, "loss": 0.62435722, "num_input_tokens_seen": 36882030, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.24804688, "step": 1706, "time_per_iteration": 3.0974771976470947 }, { "auxiliary_loss_clip": 0.01129113, "auxiliary_loss_mlp": 0.01045736, "balance_loss_clip": 1.02063107, "balance_loss_mlp": 1.03540444, "epoch": 0.10263039230422366, "flos": 14865737356800.0, "grad_norm": 2.324386550135008, "language_loss": 0.86008024, "learning_rate": 3.897149565008996e-06, "loss": 0.88182867, "num_input_tokens_seen": 36899245, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.9375, "step": 1707, "time_per_iteration": 2.3840553760528564 }, { "auxiliary_loss_clip": 0.01126693, "auxiliary_loss_mlp": 0.0104306, "balance_loss_clip": 1.01841986, "balance_loss_mlp": 1.03309667, "epoch": 0.10269051555689163, "flos": 25336434211200.0, "grad_norm": 1.4926963578695893, "language_loss": 0.7271347, "learning_rate": 3.897029930318406e-06, "loss": 0.74883235, "num_input_tokens_seen": 36920950, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.9375, "step": 1708, "time_per_iteration": 2.418433904647827 }, { "auxiliary_loss_clip": 0.01124308, "auxiliary_loss_mlp": 0.01049843, "balance_loss_clip": 1.02458286, "balance_loss_mlp": 1.03183699, "epoch": 0.10275063880955959, "flos": 21067778430720.0, "grad_norm": 1.7060560352128098, "language_loss": 0.91177273, "learning_rate": 3.8969102279276974e-06, "loss": 0.93351424, "num_input_tokens_seen": 36938900, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.92578125, "step": 1709, "time_per_iteration": 2.3767495155334473 }, { "auxiliary_loss_clip": 0.01125528, "auxiliary_loss_mlp": 0.01043814, "balance_loss_clip": 1.01924491, "balance_loss_mlp": 1.03394461, "epoch": 0.10281076206222757, "flos": 30365638248960.0, "grad_norm": 2.6524481322250177, "language_loss": 0.88083231, "learning_rate": 3.896790457841142e-06, "loss": 0.90252578, "num_input_tokens_seen": 36957010, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.9140625, "step": 1710, "time_per_iteration": 2.436161994934082 }, { "auxiliary_loss_clip": 0.01120898, "auxiliary_loss_mlp": 0.01043799, "balance_loss_clip": 1.01883698, "balance_loss_mlp": 1.03276277, "epoch": 0.10287088531489554, "flos": 22417241368320.0, "grad_norm": 2.085302421561381, "language_loss": 0.79199672, "learning_rate": 3.896670620063015e-06, "loss": 0.81364369, "num_input_tokens_seen": 36977690, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.8828125, "step": 1711, "time_per_iteration": 2.4026315212249756 }, { "auxiliary_loss_clip": 0.01127818, "auxiliary_loss_mlp": 0.01048941, "balance_loss_clip": 1.0223453, "balance_loss_mlp": 1.03480744, "epoch": 0.1029310085675635, "flos": 25114910434560.0, "grad_norm": 2.9395482296819866, "language_loss": 0.73571283, "learning_rate": 3.896550714597592e-06, "loss": 0.75748044, "num_input_tokens_seen": 36997300, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.9296875, "step": 1712, "time_per_iteration": 2.428074598312378 }, { "auxiliary_loss_clip": 0.01124127, "auxiliary_loss_mlp": 0.01050446, "balance_loss_clip": 1.02728355, "balance_loss_mlp": 1.03561556, "epoch": 0.10299113182023148, "flos": 19864601556480.0, "grad_norm": 1.8027700892132426, "language_loss": 0.86771899, "learning_rate": 3.896430741449153e-06, "loss": 0.88946474, "num_input_tokens_seen": 37016110, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.88671875, "step": 1713, "time_per_iteration": 3.7723255157470703 }, { "auxiliary_loss_clip": 0.01124946, "auxiliary_loss_mlp": 0.01043759, "balance_loss_clip": 1.02115691, "balance_loss_mlp": 1.03272271, "epoch": 0.10305125507289944, "flos": 20446603787520.0, "grad_norm": 1.5695577095444464, "language_loss": 0.72571588, "learning_rate": 3.8963107006219785e-06, "loss": 0.74740291, "num_input_tokens_seen": 37036405, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.921875, "step": 1714, "time_per_iteration": 2.414334535598755 }, { "auxiliary_loss_clip": 0.01130338, "auxiliary_loss_mlp": 0.01050869, "balance_loss_clip": 1.02520323, "balance_loss_mlp": 1.03358746, "epoch": 0.10311137832556741, "flos": 26249552576640.0, "grad_norm": 2.3658421272882806, "language_loss": 0.90832257, "learning_rate": 3.896190592120353e-06, "loss": 0.93013465, "num_input_tokens_seen": 37057580, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.96875, "step": 1715, "time_per_iteration": 3.8215456008911133 }, { "auxiliary_loss_clip": 0.01126425, "auxiliary_loss_mlp": 0.01044727, "balance_loss_clip": 1.01991987, "balance_loss_mlp": 1.03373456, "epoch": 0.10317150157823539, "flos": 35297468904960.0, "grad_norm": 2.4404738181742807, "language_loss": 0.75811809, "learning_rate": 3.896070415948563e-06, "loss": 0.77982962, "num_input_tokens_seen": 37079120, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.92578125, "step": 1716, "time_per_iteration": 2.520768165588379 }, { "auxiliary_loss_clip": 0.01131289, "auxiliary_loss_mlp": 0.01053478, "balance_loss_clip": 1.02689457, "balance_loss_mlp": 1.03468513, "epoch": 0.10323162483090335, "flos": 25738738341120.0, "grad_norm": 1.8637532906378036, "language_loss": 0.8557725, "learning_rate": 3.895950172110897e-06, "loss": 0.87762022, "num_input_tokens_seen": 37099710, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.96484375, "step": 1717, "time_per_iteration": 3.9256229400634766 }, { "auxiliary_loss_clip": 0.01126121, "auxiliary_loss_mlp": 0.01052604, "balance_loss_clip": 1.02903628, "balance_loss_mlp": 1.03377187, "epoch": 0.10329174808357132, "flos": 16288936819200.0, "grad_norm": 1.8295567488097717, "language_loss": 0.8306402, "learning_rate": 3.895829860611646e-06, "loss": 0.85242748, "num_input_tokens_seen": 37117775, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.921875, "step": 1718, "time_per_iteration": 2.3728268146514893 }, { "auxiliary_loss_clip": 0.01126617, "auxiliary_loss_mlp": 0.01044083, "balance_loss_clip": 1.02005076, "balance_loss_mlp": 1.0349555, "epoch": 0.10335187133623928, "flos": 36685615495680.0, "grad_norm": 1.9096384483571365, "language_loss": 0.72850704, "learning_rate": 3.895709481455105e-06, "loss": 0.75021404, "num_input_tokens_seen": 37140280, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.91796875, "step": 1719, "time_per_iteration": 2.532106399536133 }, { "auxiliary_loss_clip": 0.0112546, "auxiliary_loss_mlp": 0.01044799, "balance_loss_clip": 1.02068257, "balance_loss_mlp": 1.03439403, "epoch": 0.10341199458890726, "flos": 14974771132800.0, "grad_norm": 2.126221701693877, "language_loss": 0.92706668, "learning_rate": 3.895589034645568e-06, "loss": 0.94876933, "num_input_tokens_seen": 37158350, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.91015625, "step": 1720, "time_per_iteration": 2.364673137664795 }, { "auxiliary_loss_clip": 0.01125416, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.01660395, "balance_loss_mlp": 1.03304362, "epoch": 0.10347211784157523, "flos": 21030561054720.0, "grad_norm": 2.0938448238925855, "language_loss": 0.79727536, "learning_rate": 3.8954685201873344e-06, "loss": 0.81895995, "num_input_tokens_seen": 37177120, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.921875, "step": 1721, "time_per_iteration": 2.389920949935913 }, { "auxiliary_loss_clip": 0.01129995, "auxiliary_loss_mlp": 0.01047324, "balance_loss_clip": 1.02155101, "balance_loss_mlp": 1.03391886, "epoch": 0.1035322410942432, "flos": 19791074499840.0, "grad_norm": 3.5107157029297373, "language_loss": 0.80865979, "learning_rate": 3.895347938084706e-06, "loss": 0.83043295, "num_input_tokens_seen": 37195895, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.9609375, "step": 1722, "time_per_iteration": 2.397848129272461 }, { "auxiliary_loss_clip": 0.01026926, "auxiliary_loss_mlp": 0.01003916, "balance_loss_clip": 0.99988711, "balance_loss_mlp": 1.00182128, "epoch": 0.10359236434691117, "flos": 52696014554880.0, "grad_norm": 0.9208959392453442, "language_loss": 0.6713531, "learning_rate": 3.895227288341984e-06, "loss": 0.6916616, "num_input_tokens_seen": 37247270, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.25195312, "step": 1723, "time_per_iteration": 2.7822790145874023 }, { "auxiliary_loss_clip": 0.01127009, "auxiliary_loss_mlp": 0.01054408, "balance_loss_clip": 1.02935004, "balance_loss_mlp": 1.03282261, "epoch": 0.10365248759957914, "flos": 18404429097600.0, "grad_norm": 3.3654607027691053, "language_loss": 0.77949142, "learning_rate": 3.8951065709634755e-06, "loss": 0.80130565, "num_input_tokens_seen": 37265595, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.94140625, "step": 1724, "time_per_iteration": 2.4069485664367676 }, { "auxiliary_loss_clip": 0.0113112, "auxiliary_loss_mlp": 0.0105769, "balance_loss_clip": 1.03269172, "balance_loss_mlp": 1.03416538, "epoch": 0.1037126108522471, "flos": 47551878587520.0, "grad_norm": 1.7015623873393253, "language_loss": 0.74837613, "learning_rate": 3.8949857859534884e-06, "loss": 0.77026427, "num_input_tokens_seen": 37286660, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.97265625, "step": 1725, "time_per_iteration": 2.601824998855591 }, { "auxiliary_loss_clip": 0.01125892, "auxiliary_loss_mlp": 0.01055733, "balance_loss_clip": 1.03295231, "balance_loss_mlp": 1.034518, "epoch": 0.10377273410491508, "flos": 22815670337280.0, "grad_norm": 1.9342342234349115, "language_loss": 0.74688578, "learning_rate": 3.894864933316333e-06, "loss": 0.76870197, "num_input_tokens_seen": 37304915, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.9140625, "step": 1726, "time_per_iteration": 2.403597116470337 }, { "auxiliary_loss_clip": 0.0112702, "auxiliary_loss_mlp": 0.01050749, "balance_loss_clip": 1.02503598, "balance_loss_mlp": 1.03321671, "epoch": 0.10383285735758305, "flos": 26137551335040.0, "grad_norm": 1.9402703290147678, "language_loss": 0.72989267, "learning_rate": 3.894744013056322e-06, "loss": 0.75167036, "num_input_tokens_seen": 37325265, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.9375, "step": 1727, "time_per_iteration": 2.416836738586426 }, { "auxiliary_loss_clip": 0.01125883, "auxiliary_loss_mlp": 0.01053341, "balance_loss_clip": 1.02731776, "balance_loss_mlp": 1.03287256, "epoch": 0.10389298061025101, "flos": 17090856904320.0, "grad_norm": 2.1202233462362714, "language_loss": 0.8460077, "learning_rate": 3.894623025177772e-06, "loss": 0.8678, "num_input_tokens_seen": 37341650, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.9296875, "step": 1728, "time_per_iteration": 2.358876943588257 }, { "auxiliary_loss_clip": 0.01125422, "auxiliary_loss_mlp": 0.01048718, "balance_loss_clip": 1.02230167, "balance_loss_mlp": 1.03374028, "epoch": 0.10395310386291898, "flos": 20775485594880.0, "grad_norm": 2.289753052034552, "language_loss": 0.70360857, "learning_rate": 3.894501969684999e-06, "loss": 0.7253499, "num_input_tokens_seen": 37360270, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.91796875, "step": 1729, "time_per_iteration": 2.4003336429595947 }, { "auxiliary_loss_clip": 0.01123786, "auxiliary_loss_mlp": 0.01048196, "balance_loss_clip": 1.02238703, "balance_loss_mlp": 1.032902, "epoch": 0.10401322711558696, "flos": 12819792240000.0, "grad_norm": 2.4446926699856104, "language_loss": 0.81571615, "learning_rate": 3.894380846582324e-06, "loss": 0.83743596, "num_input_tokens_seen": 37375225, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.91015625, "step": 1730, "time_per_iteration": 2.360623598098755 }, { "auxiliary_loss_clip": 0.01121029, "auxiliary_loss_mlp": 0.0103754, "balance_loss_clip": 1.01527131, "balance_loss_mlp": 1.03012872, "epoch": 0.10407335036825492, "flos": 23183584911360.0, "grad_norm": 1.7366924144958773, "language_loss": 0.75999582, "learning_rate": 3.89425965587407e-06, "loss": 0.78158152, "num_input_tokens_seen": 37395165, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.91015625, "step": 1731, "time_per_iteration": 2.3825974464416504 }, { "auxiliary_loss_clip": 0.01120133, "auxiliary_loss_mlp": 0.01043516, "balance_loss_clip": 1.02133155, "balance_loss_mlp": 1.03189421, "epoch": 0.10413347362092289, "flos": 26102987222400.0, "grad_norm": 2.4071442074963687, "language_loss": 0.82738227, "learning_rate": 3.894138397564562e-06, "loss": 0.84901875, "num_input_tokens_seen": 37414845, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.8828125, "step": 1732, "time_per_iteration": 2.423374891281128 }, { "auxiliary_loss_clip": 0.01122988, "auxiliary_loss_mlp": 0.01041163, "balance_loss_clip": 1.01872814, "balance_loss_mlp": 1.03322685, "epoch": 0.10419359687359087, "flos": 12640233784320.0, "grad_norm": 2.216770909856606, "language_loss": 0.83156657, "learning_rate": 3.894017071658125e-06, "loss": 0.85320818, "num_input_tokens_seen": 37432490, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.8984375, "step": 1733, "time_per_iteration": 2.3525197505950928 }, { "auxiliary_loss_clip": 0.01125276, "auxiliary_loss_mlp": 0.01040831, "balance_loss_clip": 1.01758552, "balance_loss_mlp": 1.03219247, "epoch": 0.10425372012625883, "flos": 12124427224320.0, "grad_norm": 2.1742975654993333, "language_loss": 0.76333314, "learning_rate": 3.893895678159092e-06, "loss": 0.78499418, "num_input_tokens_seen": 37449435, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.9296875, "step": 1734, "time_per_iteration": 2.3762547969818115 }, { "auxiliary_loss_clip": 0.01120024, "auxiliary_loss_mlp": 0.01038903, "balance_loss_clip": 1.01643229, "balance_loss_mlp": 1.03039026, "epoch": 0.1043138433789268, "flos": 25336399299840.0, "grad_norm": 1.7597136442224786, "language_loss": 0.75126266, "learning_rate": 3.8937742170717935e-06, "loss": 0.77285194, "num_input_tokens_seen": 37469105, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.8984375, "step": 1735, "time_per_iteration": 2.4201161861419678 }, { "auxiliary_loss_clip": 0.01123744, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.01808333, "balance_loss_mlp": 1.0322299, "epoch": 0.10437396663159478, "flos": 29165917599360.0, "grad_norm": 1.638933424974742, "language_loss": 0.7859149, "learning_rate": 3.893652688400565e-06, "loss": 0.80758798, "num_input_tokens_seen": 37490540, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9140625, "step": 1736, "time_per_iteration": 2.5261738300323486 }, { "auxiliary_loss_clip": 0.01122254, "auxiliary_loss_mlp": 0.01055381, "balance_loss_clip": 1.03001356, "balance_loss_mlp": 1.03328729, "epoch": 0.10443408988426274, "flos": 25079822651520.0, "grad_norm": 1.8328233590421816, "language_loss": 0.70703518, "learning_rate": 3.893531092149743e-06, "loss": 0.72881156, "num_input_tokens_seen": 37511905, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.890625, "step": 1737, "time_per_iteration": 2.435969352722168 }, { "auxiliary_loss_clip": 0.01126007, "auxiliary_loss_mlp": 0.01052773, "balance_loss_clip": 1.02487803, "balance_loss_mlp": 1.03007066, "epoch": 0.1044942131369307, "flos": 26758481598720.0, "grad_norm": 1.781210849877685, "language_loss": 0.81362653, "learning_rate": 3.893409428323666e-06, "loss": 0.83541435, "num_input_tokens_seen": 37533635, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.9609375, "step": 1738, "time_per_iteration": 2.429069757461548 }, { "auxiliary_loss_clip": 0.01124425, "auxiliary_loss_mlp": 0.01053833, "balance_loss_clip": 1.02822733, "balance_loss_mlp": 1.03121376, "epoch": 0.10455433638959867, "flos": 18441576650880.0, "grad_norm": 1.8175202610349077, "language_loss": 0.74855512, "learning_rate": 3.8932876969266785e-06, "loss": 0.7703377, "num_input_tokens_seen": 37552035, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.93359375, "step": 1739, "time_per_iteration": 2.4002676010131836 }, { "auxiliary_loss_clip": 0.01121714, "auxiliary_loss_mlp": 0.01050747, "balance_loss_clip": 1.02702391, "balance_loss_mlp": 1.03128552, "epoch": 0.10461445964226665, "flos": 23217939555840.0, "grad_norm": 3.3126482199985987, "language_loss": 0.77350897, "learning_rate": 3.893165897963123e-06, "loss": 0.79523361, "num_input_tokens_seen": 37571540, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.90625, "step": 1740, "time_per_iteration": 2.390667676925659 }, { "auxiliary_loss_clip": 0.01124322, "auxiliary_loss_mlp": 0.01046332, "balance_loss_clip": 1.02300251, "balance_loss_mlp": 1.03353405, "epoch": 0.10467458289493461, "flos": 24344307705600.0, "grad_norm": 2.0689127159114054, "language_loss": 0.8588016, "learning_rate": 3.893044031437346e-06, "loss": 0.88050812, "num_input_tokens_seen": 37588265, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.90625, "step": 1741, "time_per_iteration": 2.3992786407470703 }, { "auxiliary_loss_clip": 0.01124591, "auxiliary_loss_mlp": 0.0104551, "balance_loss_clip": 1.01955867, "balance_loss_mlp": 1.03237677, "epoch": 0.10473470614760258, "flos": 21286893323520.0, "grad_norm": 2.532080922859773, "language_loss": 0.75275385, "learning_rate": 3.892922097353697e-06, "loss": 0.77445483, "num_input_tokens_seen": 37606860, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.921875, "step": 1742, "time_per_iteration": 2.376932382583618 }, { "auxiliary_loss_clip": 0.01124536, "auxiliary_loss_mlp": 0.01048128, "balance_loss_clip": 1.02493, "balance_loss_mlp": 1.03433001, "epoch": 0.10479482940027056, "flos": 21686195076480.0, "grad_norm": 1.9615184924185378, "language_loss": 0.86979914, "learning_rate": 3.8928000957165275e-06, "loss": 0.89152575, "num_input_tokens_seen": 37625210, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.90234375, "step": 1743, "time_per_iteration": 2.388972520828247 }, { "auxiliary_loss_clip": 0.01121899, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.02152383, "balance_loss_mlp": 1.0318445, "epoch": 0.10485495265293852, "flos": 21572797380480.0, "grad_norm": 1.980346536726369, "language_loss": 0.75399542, "learning_rate": 3.8926780265301915e-06, "loss": 0.77569425, "num_input_tokens_seen": 37644110, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.90234375, "step": 1744, "time_per_iteration": 2.3914849758148193 }, { "auxiliary_loss_clip": 0.01124598, "auxiliary_loss_mlp": 0.01050524, "balance_loss_clip": 1.02765965, "balance_loss_mlp": 1.03332758, "epoch": 0.10491507590560649, "flos": 37960399301760.0, "grad_norm": 1.8428502923857146, "language_loss": 0.78735441, "learning_rate": 3.8925558897990445e-06, "loss": 0.80910563, "num_input_tokens_seen": 37665800, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.9140625, "step": 1745, "time_per_iteration": 2.5292773246765137 }, { "auxiliary_loss_clip": 0.01122432, "auxiliary_loss_mlp": 0.01057055, "balance_loss_clip": 1.03259289, "balance_loss_mlp": 1.0313642, "epoch": 0.10497519915827447, "flos": 26395070590080.0, "grad_norm": 2.7675134126447194, "language_loss": 0.82449567, "learning_rate": 3.892433685527447e-06, "loss": 0.84629059, "num_input_tokens_seen": 37685095, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.91015625, "step": 1746, "time_per_iteration": 2.4199109077453613 }, { "auxiliary_loss_clip": 0.0112421, "auxiliary_loss_mlp": 0.0105103, "balance_loss_clip": 1.02669919, "balance_loss_mlp": 1.03329909, "epoch": 0.10503532241094243, "flos": 40660581985920.0, "grad_norm": 1.6092919705029667, "language_loss": 0.69958377, "learning_rate": 3.892311413719759e-06, "loss": 0.72133613, "num_input_tokens_seen": 37707445, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.90625, "step": 1747, "time_per_iteration": 2.571589708328247 }, { "auxiliary_loss_clip": 0.01128556, "auxiliary_loss_mlp": 0.01052369, "balance_loss_clip": 1.02750218, "balance_loss_mlp": 1.03342628, "epoch": 0.1050954456636104, "flos": 29788104672000.0, "grad_norm": 2.2964050379853744, "language_loss": 0.84260982, "learning_rate": 3.892189074380345e-06, "loss": 0.86441904, "num_input_tokens_seen": 37728325, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.953125, "step": 1748, "time_per_iteration": 2.4443163871765137 }, { "auxiliary_loss_clip": 0.01117316, "auxiliary_loss_mlp": 0.01042685, "balance_loss_clip": 1.01779461, "balance_loss_mlp": 1.02924657, "epoch": 0.10515556891627838, "flos": 23947694127360.0, "grad_norm": 2.0563322181054393, "language_loss": 0.71392345, "learning_rate": 3.892066667513569e-06, "loss": 0.73552346, "num_input_tokens_seen": 37748910, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.8828125, "step": 1749, "time_per_iteration": 2.41198468208313 }, { "auxiliary_loss_clip": 0.01118943, "auxiliary_loss_mlp": 0.01046165, "balance_loss_clip": 1.02150106, "balance_loss_mlp": 1.03067636, "epoch": 0.10521569216894634, "flos": 18258631793280.0, "grad_norm": 2.09474354965328, "language_loss": 0.81900769, "learning_rate": 3.891944193123801e-06, "loss": 0.84065878, "num_input_tokens_seen": 37765745, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.8828125, "step": 1750, "time_per_iteration": 2.3500618934631348 }, { "auxiliary_loss_clip": 0.01127944, "auxiliary_loss_mlp": 0.01055871, "balance_loss_clip": 1.03163528, "balance_loss_mlp": 1.03505528, "epoch": 0.10527581542161431, "flos": 15630056040960.0, "grad_norm": 2.155919689446535, "language_loss": 0.92280161, "learning_rate": 3.891821651215411e-06, "loss": 0.9446398, "num_input_tokens_seen": 37780520, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.9296875, "step": 1751, "time_per_iteration": 2.38798189163208 }, { "auxiliary_loss_clip": 0.01121446, "auxiliary_loss_mlp": 0.01042405, "balance_loss_clip": 1.01945722, "balance_loss_mlp": 1.03214347, "epoch": 0.10533593867428227, "flos": 18295569878400.0, "grad_norm": 3.1524830188228834, "language_loss": 0.78899848, "learning_rate": 3.8916990417927735e-06, "loss": 0.810637, "num_input_tokens_seen": 37799515, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.89453125, "step": 1752, "time_per_iteration": 2.3802599906921387 }, { "auxiliary_loss_clip": 0.01121154, "auxiliary_loss_mlp": 0.01045128, "balance_loss_clip": 1.0206542, "balance_loss_mlp": 1.03210664, "epoch": 0.10539606192695025, "flos": 29021935685760.0, "grad_norm": 1.8613766788519057, "language_loss": 0.75671118, "learning_rate": 3.891576364860262e-06, "loss": 0.77837402, "num_input_tokens_seen": 37818695, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.890625, "step": 1753, "time_per_iteration": 3.81923508644104 }, { "auxiliary_loss_clip": 0.01124279, "auxiliary_loss_mlp": 0.01053431, "balance_loss_clip": 1.02843332, "balance_loss_mlp": 1.03089023, "epoch": 0.10545618517961822, "flos": 19968433539840.0, "grad_norm": 1.8995117140353865, "language_loss": 0.83522022, "learning_rate": 3.891453620422258e-06, "loss": 0.85699737, "num_input_tokens_seen": 37837860, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.93359375, "step": 1754, "time_per_iteration": 3.7905337810516357 }, { "auxiliary_loss_clip": 0.01128729, "auxiliary_loss_mlp": 0.01050098, "balance_loss_clip": 1.0233947, "balance_loss_mlp": 1.03546464, "epoch": 0.10551630843228618, "flos": 16142511110400.0, "grad_norm": 2.7992506175283154, "language_loss": 0.6898886, "learning_rate": 3.891330808483137e-06, "loss": 0.71167684, "num_input_tokens_seen": 37856260, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.93359375, "step": 1755, "time_per_iteration": 2.345918655395508 }, { "auxiliary_loss_clip": 0.01127338, "auxiliary_loss_mlp": 0.01054693, "balance_loss_clip": 1.0284667, "balance_loss_mlp": 1.03414655, "epoch": 0.10557643168495416, "flos": 23439009484800.0, "grad_norm": 2.095319590645789, "language_loss": 0.76325703, "learning_rate": 3.891207929047286e-06, "loss": 0.78507727, "num_input_tokens_seen": 37876960, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9296875, "step": 1756, "time_per_iteration": 2.4101569652557373 }, { "auxiliary_loss_clip": 0.01124114, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.02310705, "balance_loss_mlp": 1.03179097, "epoch": 0.10563655493762213, "flos": 21797951938560.0, "grad_norm": 1.7351198219289925, "language_loss": 0.79872441, "learning_rate": 3.8910849821190884e-06, "loss": 0.82043695, "num_input_tokens_seen": 37897070, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.921875, "step": 1757, "time_per_iteration": 3.8162782192230225 }, { "auxiliary_loss_clip": 0.0112361, "auxiliary_loss_mlp": 0.01044144, "balance_loss_clip": 1.01888347, "balance_loss_mlp": 1.03206515, "epoch": 0.10569667819029009, "flos": 53798782625280.0, "grad_norm": 1.5596724518209486, "language_loss": 0.78979349, "learning_rate": 3.890961967702933e-06, "loss": 0.81147099, "num_input_tokens_seen": 37923635, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.9140625, "step": 1758, "time_per_iteration": 2.692870616912842 }, { "auxiliary_loss_clip": 0.01127386, "auxiliary_loss_mlp": 0.01040808, "balance_loss_clip": 1.01633477, "balance_loss_mlp": 1.03447723, "epoch": 0.10575680144295807, "flos": 22924529556480.0, "grad_norm": 1.7044128071529396, "language_loss": 0.91619074, "learning_rate": 3.890838885803208e-06, "loss": 0.93787271, "num_input_tokens_seen": 37942650, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.9296875, "step": 1759, "time_per_iteration": 2.395379066467285 }, { "auxiliary_loss_clip": 0.01126309, "auxiliary_loss_mlp": 0.01055059, "balance_loss_clip": 1.02841604, "balance_loss_mlp": 1.0312767, "epoch": 0.10581692469562604, "flos": 14135808228480.0, "grad_norm": 1.9812907161966353, "language_loss": 0.77218324, "learning_rate": 3.890715736424307e-06, "loss": 0.79399687, "num_input_tokens_seen": 37960660, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.953125, "step": 1760, "time_per_iteration": 2.3628458976745605 }, { "auxiliary_loss_clip": 0.01124844, "auxiliary_loss_mlp": 0.01051971, "balance_loss_clip": 1.0255419, "balance_loss_mlp": 1.03168297, "epoch": 0.105877047948294, "flos": 25957469208960.0, "grad_norm": 3.0718633757371125, "language_loss": 0.8935079, "learning_rate": 3.890592519570626e-06, "loss": 0.91527599, "num_input_tokens_seen": 37978625, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9296875, "step": 1761, "time_per_iteration": 2.4184231758117676 }, { "auxiliary_loss_clip": 0.01125491, "auxiliary_loss_mlp": 0.01053337, "balance_loss_clip": 1.02795768, "balance_loss_mlp": 1.03289485, "epoch": 0.10593717120096197, "flos": 30663447079680.0, "grad_norm": 2.260293059938474, "language_loss": 0.7777102, "learning_rate": 3.89046923524656e-06, "loss": 0.79949844, "num_input_tokens_seen": 38000005, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.92578125, "step": 1762, "time_per_iteration": 2.473712205886841 }, { "auxiliary_loss_clip": 0.01029718, "auxiliary_loss_mlp": 0.01002481, "balance_loss_clip": 0.99895269, "balance_loss_mlp": 1.00415659, "epoch": 0.10599729445362994, "flos": 66432905055360.0, "grad_norm": 0.7567652517232661, "language_loss": 0.60488772, "learning_rate": 3.8903458834565105e-06, "loss": 0.62520969, "num_input_tokens_seen": 38066165, "router_z_loss_clip": 0.03540039, "router_z_loss_mlp": 0.25585938, "step": 1763, "time_per_iteration": 3.18863844871521 }, { "auxiliary_loss_clip": 0.0112237, "auxiliary_loss_mlp": 0.01042503, "balance_loss_clip": 1.01810062, "balance_loss_mlp": 1.03152835, "epoch": 0.10605741770629791, "flos": 23947135545600.0, "grad_norm": 1.7391662287955905, "language_loss": 0.79645944, "learning_rate": 3.890222464204879e-06, "loss": 0.8181082, "num_input_tokens_seen": 38086150, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.90625, "step": 1764, "time_per_iteration": 2.401545524597168 }, { "auxiliary_loss_clip": 0.01123566, "auxiliary_loss_mlp": 0.01048595, "balance_loss_clip": 1.02396607, "balance_loss_mlp": 1.03268123, "epoch": 0.10611754095896588, "flos": 19386605865600.0, "grad_norm": 1.916558263318626, "language_loss": 0.80186951, "learning_rate": 3.89009897749607e-06, "loss": 0.82359111, "num_input_tokens_seen": 38104205, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.91015625, "step": 1765, "time_per_iteration": 2.383697748184204 }, { "auxiliary_loss_clip": 0.01120346, "auxiliary_loss_mlp": 0.01050923, "balance_loss_clip": 1.02638936, "balance_loss_mlp": 1.03064609, "epoch": 0.10617766421163385, "flos": 22236635571840.0, "grad_norm": 1.840882934516297, "language_loss": 0.76780617, "learning_rate": 3.88997542333449e-06, "loss": 0.78951889, "num_input_tokens_seen": 38122005, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.8984375, "step": 1766, "time_per_iteration": 2.376946210861206 }, { "auxiliary_loss_clip": 0.01125036, "auxiliary_loss_mlp": 0.01054262, "balance_loss_clip": 1.02844131, "balance_loss_mlp": 1.03260565, "epoch": 0.10623778746430182, "flos": 28403100103680.0, "grad_norm": 1.5994413753068162, "language_loss": 0.77417314, "learning_rate": 3.889851801724549e-06, "loss": 0.79596615, "num_input_tokens_seen": 38143365, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.92578125, "step": 1767, "time_per_iteration": 2.4704442024230957 }, { "auxiliary_loss_clip": 0.01028991, "auxiliary_loss_mlp": 0.0100806, "balance_loss_clip": 1.00436473, "balance_loss_mlp": 1.00339031, "epoch": 0.10629791071696978, "flos": 64231282719360.0, "grad_norm": 0.6754453633366562, "language_loss": 0.57893264, "learning_rate": 3.889728112670658e-06, "loss": 0.59930313, "num_input_tokens_seen": 38210035, "router_z_loss_clip": 0.03686523, "router_z_loss_mlp": 0.25585938, "step": 1768, "time_per_iteration": 3.1264593601226807 }, { "auxiliary_loss_clip": 0.01125592, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.01608372, "balance_loss_mlp": 1.03276598, "epoch": 0.10635803396963776, "flos": 22746472289280.0, "grad_norm": 1.3944339810139335, "language_loss": 0.86446828, "learning_rate": 3.8896043561772325e-06, "loss": 0.88611525, "num_input_tokens_seen": 38231230, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.9296875, "step": 1769, "time_per_iteration": 2.437068462371826 }, { "auxiliary_loss_clip": 0.0112854, "auxiliary_loss_mlp": 0.01050713, "balance_loss_clip": 1.02455854, "balance_loss_mlp": 1.0353334, "epoch": 0.10641815722230573, "flos": 31394214080640.0, "grad_norm": 2.81559546028732, "language_loss": 0.61949551, "learning_rate": 3.889480532248688e-06, "loss": 0.64128804, "num_input_tokens_seen": 38253890, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.9296875, "step": 1770, "time_per_iteration": 2.472952365875244 }, { "auxiliary_loss_clip": 0.01029669, "auxiliary_loss_mlp": 0.01003379, "balance_loss_clip": 0.99985087, "balance_loss_mlp": 1.00370085, "epoch": 0.1064782804749737, "flos": 58550077307520.0, "grad_norm": 1.1358638098678222, "language_loss": 0.57037234, "learning_rate": 3.889356640889444e-06, "loss": 0.59070289, "num_input_tokens_seen": 38304290, "router_z_loss_clip": 0.03540039, "router_z_loss_mlp": 0.25976562, "step": 1771, "time_per_iteration": 2.942660093307495 }, { "auxiliary_loss_clip": 0.01125322, "auxiliary_loss_mlp": 0.01058676, "balance_loss_clip": 1.0330584, "balance_loss_mlp": 1.03383589, "epoch": 0.10653840372764166, "flos": 23986691982720.0, "grad_norm": 1.6196900827448717, "language_loss": 0.88175607, "learning_rate": 3.8892326821039205e-06, "loss": 0.9035961, "num_input_tokens_seen": 38324725, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.9140625, "step": 1772, "time_per_iteration": 2.4082319736480713 }, { "auxiliary_loss_clip": 0.0112954, "auxiliary_loss_mlp": 0.01046109, "balance_loss_clip": 1.01948977, "balance_loss_mlp": 1.03332782, "epoch": 0.10659852698030964, "flos": 18293719576320.0, "grad_norm": 3.5867802309173427, "language_loss": 0.7572273, "learning_rate": 3.889108655896542e-06, "loss": 0.77898383, "num_input_tokens_seen": 38340735, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.96484375, "step": 1773, "time_per_iteration": 2.360600709915161 }, { "auxiliary_loss_clip": 0.01127654, "auxiliary_loss_mlp": 0.01051195, "balance_loss_clip": 1.02645874, "balance_loss_mlp": 1.03547025, "epoch": 0.1066586502329776, "flos": 32159230992000.0, "grad_norm": 1.8622521371175176, "language_loss": 0.82763404, "learning_rate": 3.888984562271736e-06, "loss": 0.84942257, "num_input_tokens_seen": 38361315, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.921875, "step": 1774, "time_per_iteration": 2.489536762237549 }, { "auxiliary_loss_clip": 0.01129712, "auxiliary_loss_mlp": 0.01051944, "balance_loss_clip": 1.02571797, "balance_loss_mlp": 1.03411245, "epoch": 0.10671877348564557, "flos": 17784197061120.0, "grad_norm": 2.2850490467324875, "language_loss": 0.76627076, "learning_rate": 3.888860401233929e-06, "loss": 0.78808731, "num_input_tokens_seen": 38377425, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.95703125, "step": 1775, "time_per_iteration": 2.3686110973358154 }, { "auxiliary_loss_clip": 0.01029928, "auxiliary_loss_mlp": 0.01003173, "balance_loss_clip": 0.99997795, "balance_loss_mlp": 1.00443852, "epoch": 0.10677889673831355, "flos": 63506695029120.0, "grad_norm": 0.8170942420256979, "language_loss": 0.57425374, "learning_rate": 3.8887361727875535e-06, "loss": 0.5945847, "num_input_tokens_seen": 38440275, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.25390625, "step": 1776, "time_per_iteration": 3.05979585647583 }, { "auxiliary_loss_clip": 0.01126727, "auxiliary_loss_mlp": 0.010466, "balance_loss_clip": 1.02071953, "balance_loss_mlp": 1.03543973, "epoch": 0.10683901999098151, "flos": 22016612983680.0, "grad_norm": 1.5249944085465545, "language_loss": 0.8304826, "learning_rate": 3.888611876937043e-06, "loss": 0.85221589, "num_input_tokens_seen": 38461820, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.9140625, "step": 1777, "time_per_iteration": 2.4524457454681396 }, { "auxiliary_loss_clip": 0.01125564, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.02347136, "balance_loss_mlp": 1.03583741, "epoch": 0.10689914324364948, "flos": 25041872136960.0, "grad_norm": 3.6853021162900017, "language_loss": 0.87512541, "learning_rate": 3.888487513686832e-06, "loss": 0.89686102, "num_input_tokens_seen": 38482235, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.8984375, "step": 1778, "time_per_iteration": 2.4289801120758057 }, { "auxiliary_loss_clip": 0.01129984, "auxiliary_loss_mlp": 0.01050385, "balance_loss_clip": 1.024683, "balance_loss_mlp": 1.0366596, "epoch": 0.10695926649631746, "flos": 16434210453120.0, "grad_norm": 1.9132762909614143, "language_loss": 0.84370452, "learning_rate": 3.88836308304136e-06, "loss": 0.8655082, "num_input_tokens_seen": 38500690, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.93359375, "step": 1779, "time_per_iteration": 2.385535955429077 }, { "auxiliary_loss_clip": 0.01121678, "auxiliary_loss_mlp": 0.01045359, "balance_loss_clip": 1.02161276, "balance_loss_mlp": 1.03248048, "epoch": 0.10701938974898542, "flos": 16978366903680.0, "grad_norm": 1.940612983001018, "language_loss": 0.67382878, "learning_rate": 3.888238585005066e-06, "loss": 0.69549918, "num_input_tokens_seen": 38518405, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.890625, "step": 1780, "time_per_iteration": 2.3779759407043457 }, { "auxiliary_loss_clip": 0.01124395, "auxiliary_loss_mlp": 0.01048141, "balance_loss_clip": 1.02261877, "balance_loss_mlp": 1.03375697, "epoch": 0.10707951300165339, "flos": 15887191271040.0, "grad_norm": 2.120473397927048, "language_loss": 0.91888499, "learning_rate": 3.888114019582395e-06, "loss": 0.94061041, "num_input_tokens_seen": 38535060, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.90625, "step": 1781, "time_per_iteration": 2.366016387939453 }, { "auxiliary_loss_clip": 0.01125598, "auxiliary_loss_mlp": 0.01046815, "balance_loss_clip": 1.02105403, "balance_loss_mlp": 1.03395641, "epoch": 0.10713963625432135, "flos": 14246273370240.0, "grad_norm": 1.961229098536527, "language_loss": 0.79416013, "learning_rate": 3.887989386777791e-06, "loss": 0.81588423, "num_input_tokens_seen": 38552855, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.91796875, "step": 1782, "time_per_iteration": 2.359746217727661 }, { "auxiliary_loss_clip": 0.01126018, "auxiliary_loss_mlp": 0.01047457, "balance_loss_clip": 1.02292371, "balance_loss_mlp": 1.03313398, "epoch": 0.10719975950698933, "flos": 16756040165760.0, "grad_norm": 2.134353445152127, "language_loss": 0.78729677, "learning_rate": 3.887864686595703e-06, "loss": 0.80903149, "num_input_tokens_seen": 38570075, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.9296875, "step": 1783, "time_per_iteration": 2.3823351860046387 }, { "auxiliary_loss_clip": 0.01127351, "auxiliary_loss_mlp": 0.01051587, "balance_loss_clip": 1.02654111, "balance_loss_mlp": 1.03401184, "epoch": 0.1072598827596573, "flos": 22709534204160.0, "grad_norm": 1.9319803999437355, "language_loss": 0.86656928, "learning_rate": 3.887739919040579e-06, "loss": 0.88835871, "num_input_tokens_seen": 38587970, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.93359375, "step": 1784, "time_per_iteration": 2.3890769481658936 }, { "auxiliary_loss_clip": 0.01128957, "auxiliary_loss_mlp": 0.01048849, "balance_loss_clip": 1.02149022, "balance_loss_mlp": 1.03402746, "epoch": 0.10732000601232526, "flos": 23257146879360.0, "grad_norm": 2.588319712177381, "language_loss": 1.0069952, "learning_rate": 3.887615084116874e-06, "loss": 1.02877331, "num_input_tokens_seen": 38605840, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.94921875, "step": 1785, "time_per_iteration": 2.4029221534729004 }, { "auxiliary_loss_clip": 0.01123074, "auxiliary_loss_mlp": 0.01048438, "balance_loss_clip": 1.02413082, "balance_loss_mlp": 1.0350312, "epoch": 0.10738012926499324, "flos": 24205911609600.0, "grad_norm": 1.3298221024401562, "language_loss": 0.84858882, "learning_rate": 3.887490181829042e-06, "loss": 0.87030399, "num_input_tokens_seen": 38627070, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.8828125, "step": 1786, "time_per_iteration": 2.4200692176818848 }, { "auxiliary_loss_clip": 0.0112323, "auxiliary_loss_mlp": 0.01046963, "balance_loss_clip": 1.01944935, "balance_loss_mlp": 1.03104043, "epoch": 0.1074402525176612, "flos": 20922016037760.0, "grad_norm": 1.716800751345172, "language_loss": 0.78385222, "learning_rate": 3.887365212181542e-06, "loss": 0.80555415, "num_input_tokens_seen": 38645840, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 0.921875, "step": 1787, "time_per_iteration": 2.38824462890625 }, { "auxiliary_loss_clip": 0.01128656, "auxiliary_loss_mlp": 0.01050491, "balance_loss_clip": 1.02328789, "balance_loss_mlp": 1.03389168, "epoch": 0.10750037577032917, "flos": 16945967295360.0, "grad_norm": 1.7573360887890106, "language_loss": 0.82472336, "learning_rate": 3.88724017517883e-06, "loss": 0.84651482, "num_input_tokens_seen": 38664770, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.9453125, "step": 1788, "time_per_iteration": 2.3621251583099365 }, { "auxiliary_loss_clip": 0.01123994, "auxiliary_loss_mlp": 0.01050739, "balance_loss_clip": 1.02641988, "balance_loss_mlp": 1.03294051, "epoch": 0.10756049902299715, "flos": 20265509232000.0, "grad_norm": 1.838983977324372, "language_loss": 0.78195995, "learning_rate": 3.887115070825373e-06, "loss": 0.8037073, "num_input_tokens_seen": 38683865, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.91015625, "step": 1789, "time_per_iteration": 2.3786821365356445 }, { "auxiliary_loss_clip": 0.01126292, "auxiliary_loss_mlp": 0.01060179, "balance_loss_clip": 1.03047156, "balance_loss_mlp": 1.0334549, "epoch": 0.10762062227566511, "flos": 23585400282240.0, "grad_norm": 2.6683776947526225, "language_loss": 0.745278, "learning_rate": 3.886989899125632e-06, "loss": 0.76714271, "num_input_tokens_seen": 38702485, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 0.9296875, "step": 1790, "time_per_iteration": 2.4150373935699463 }, { "auxiliary_loss_clip": 0.01127052, "auxiliary_loss_mlp": 0.01061173, "balance_loss_clip": 1.03416061, "balance_loss_mlp": 1.03454113, "epoch": 0.10768074552833308, "flos": 24309638858880.0, "grad_norm": 2.0808194179309796, "language_loss": 0.78436476, "learning_rate": 3.886864660084075e-06, "loss": 0.806247, "num_input_tokens_seen": 38722475, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.92578125, "step": 1791, "time_per_iteration": 2.408830404281616 }, { "auxiliary_loss_clip": 0.01119221, "auxiliary_loss_mlp": 0.0104741, "balance_loss_clip": 1.02353215, "balance_loss_mlp": 1.03260255, "epoch": 0.10774086878100106, "flos": 25298832810240.0, "grad_norm": 1.8902431432210107, "language_loss": 0.70625722, "learning_rate": 3.886739353705173e-06, "loss": 0.72792351, "num_input_tokens_seen": 38743285, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.8671875, "step": 1792, "time_per_iteration": 3.9579362869262695 }, { "auxiliary_loss_clip": 0.01125953, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.0159843, "balance_loss_mlp": 1.03193712, "epoch": 0.10780099203366902, "flos": 22052957575680.0, "grad_norm": 1.8560698854611348, "language_loss": 0.75875032, "learning_rate": 3.886613979993396e-06, "loss": 0.78044033, "num_input_tokens_seen": 38763035, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.9375, "step": 1793, "time_per_iteration": 2.3957271575927734 }, { "auxiliary_loss_clip": 0.01124997, "auxiliary_loss_mlp": 0.01057005, "balance_loss_clip": 1.0321852, "balance_loss_mlp": 1.03507137, "epoch": 0.10786111528633699, "flos": 22746367555200.0, "grad_norm": 1.5432225075588661, "language_loss": 0.85082167, "learning_rate": 3.886488538953219e-06, "loss": 0.87264168, "num_input_tokens_seen": 38784900, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.8984375, "step": 1794, "time_per_iteration": 3.90265154838562 }, { "auxiliary_loss_clip": 0.01129109, "auxiliary_loss_mlp": 0.01044994, "balance_loss_clip": 1.01956701, "balance_loss_mlp": 1.0332222, "epoch": 0.10792123853900495, "flos": 20849990169600.0, "grad_norm": 1.9717999863669853, "language_loss": 0.7450695, "learning_rate": 3.8863630305891196e-06, "loss": 0.76681054, "num_input_tokens_seen": 38804695, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9609375, "step": 1795, "time_per_iteration": 2.3823306560516357 }, { "auxiliary_loss_clip": 0.01127177, "auxiliary_loss_mlp": 0.01049021, "balance_loss_clip": 1.02110219, "balance_loss_mlp": 1.03276324, "epoch": 0.10798136179167293, "flos": 17747747735040.0, "grad_norm": 2.4639211054087333, "language_loss": 0.81476229, "learning_rate": 3.8862374549055755e-06, "loss": 0.83652425, "num_input_tokens_seen": 38822395, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 0.9453125, "step": 1796, "time_per_iteration": 3.7188520431518555 }, { "auxiliary_loss_clip": 0.01128867, "auxiliary_loss_mlp": 0.01058443, "balance_loss_clip": 1.03150129, "balance_loss_mlp": 1.03387094, "epoch": 0.1080414850443409, "flos": 13588789046400.0, "grad_norm": 2.2364337055104055, "language_loss": 0.73790944, "learning_rate": 3.886111811907069e-06, "loss": 0.75978255, "num_input_tokens_seen": 38839865, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.94921875, "step": 1797, "time_per_iteration": 3.7533183097839355 }, { "auxiliary_loss_clip": 0.01125669, "auxiliary_loss_mlp": 0.01046034, "balance_loss_clip": 1.02181053, "balance_loss_mlp": 1.03294826, "epoch": 0.10810160829700886, "flos": 16252487493120.0, "grad_norm": 2.129713269732791, "language_loss": 0.81432426, "learning_rate": 3.885986101598082e-06, "loss": 0.83604133, "num_input_tokens_seen": 38857300, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.9296875, "step": 1798, "time_per_iteration": 2.378171443939209 }, { "auxiliary_loss_clip": 0.01125419, "auxiliary_loss_mlp": 0.01047582, "balance_loss_clip": 1.02235758, "balance_loss_mlp": 1.0329802, "epoch": 0.10816173154967684, "flos": 15157122497280.0, "grad_norm": 2.29817641696348, "language_loss": 0.85187292, "learning_rate": 3.885860323983104e-06, "loss": 0.87360299, "num_input_tokens_seen": 38874960, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.92578125, "step": 1799, "time_per_iteration": 2.3468503952026367 }, { "auxiliary_loss_clip": 0.01122342, "auxiliary_loss_mlp": 0.01054019, "balance_loss_clip": 1.02958083, "balance_loss_mlp": 1.03431249, "epoch": 0.10822185480234481, "flos": 17784371617920.0, "grad_norm": 1.871905884018768, "language_loss": 0.76835096, "learning_rate": 3.885734479066622e-06, "loss": 0.79011458, "num_input_tokens_seen": 38893610, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.8828125, "step": 1800, "time_per_iteration": 2.383742094039917 }, { "auxiliary_loss_clip": 0.01119787, "auxiliary_loss_mlp": 0.01041734, "balance_loss_clip": 1.0178442, "balance_loss_mlp": 1.03130066, "epoch": 0.10828197805501277, "flos": 25555479281280.0, "grad_norm": 1.5186469739563766, "language_loss": 0.7293545, "learning_rate": 3.885608566853126e-06, "loss": 0.75096971, "num_input_tokens_seen": 38913485, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.88671875, "step": 1801, "time_per_iteration": 2.4169530868530273 }, { "auxiliary_loss_clip": 0.01129902, "auxiliary_loss_mlp": 0.01046469, "balance_loss_clip": 1.02205503, "balance_loss_mlp": 1.03348994, "epoch": 0.10834210130768075, "flos": 28983217121280.0, "grad_norm": 1.8508048354535698, "language_loss": 0.65805316, "learning_rate": 3.8854825873471115e-06, "loss": 0.67981684, "num_input_tokens_seen": 38935650, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.9609375, "step": 1802, "time_per_iteration": 2.4483463764190674 }, { "auxiliary_loss_clip": 0.01123685, "auxiliary_loss_mlp": 0.01052333, "balance_loss_clip": 1.02596378, "balance_loss_mlp": 1.03211331, "epoch": 0.10840222456034872, "flos": 20263239993600.0, "grad_norm": 3.1106185134885775, "language_loss": 0.81412292, "learning_rate": 3.885356540553073e-06, "loss": 0.83588308, "num_input_tokens_seen": 38954130, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9140625, "step": 1803, "time_per_iteration": 2.37387752532959 }, { "auxiliary_loss_clip": 0.01120541, "auxiliary_loss_mlp": 0.01048911, "balance_loss_clip": 1.02432966, "balance_loss_mlp": 1.03114367, "epoch": 0.10846234781301668, "flos": 19862087938560.0, "grad_norm": 1.572617341333409, "language_loss": 0.91127855, "learning_rate": 3.88523042647551e-06, "loss": 0.93297303, "num_input_tokens_seen": 38972905, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.89453125, "step": 1804, "time_per_iteration": 2.3934566974639893 }, { "auxiliary_loss_clip": 0.01123549, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.02301693, "balance_loss_mlp": 1.03132033, "epoch": 0.10852247106568465, "flos": 26467829596800.0, "grad_norm": 2.1018894025612136, "language_loss": 0.76497591, "learning_rate": 3.885104245118921e-06, "loss": 0.78669924, "num_input_tokens_seen": 38993255, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.921875, "step": 1805, "time_per_iteration": 2.423750638961792 }, { "auxiliary_loss_clip": 0.01120164, "auxiliary_loss_mlp": 0.01040732, "balance_loss_clip": 1.01696146, "balance_loss_mlp": 1.03138614, "epoch": 0.10858259431835263, "flos": 30080188039680.0, "grad_norm": 1.9556052539861366, "language_loss": 0.86247486, "learning_rate": 3.8849779964878125e-06, "loss": 0.88408375, "num_input_tokens_seen": 39012610, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.890625, "step": 1806, "time_per_iteration": 2.451107978820801 }, { "auxiliary_loss_clip": 0.01122921, "auxiliary_loss_mlp": 0.01049866, "balance_loss_clip": 1.02549911, "balance_loss_mlp": 1.03105104, "epoch": 0.10864271757102059, "flos": 19062157800960.0, "grad_norm": 3.0931004384736887, "language_loss": 0.81229842, "learning_rate": 3.884851680586687e-06, "loss": 0.83402628, "num_input_tokens_seen": 39030120, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.91796875, "step": 1807, "time_per_iteration": 2.3664300441741943 }, { "auxiliary_loss_clip": 0.01120947, "auxiliary_loss_mlp": 0.01043449, "balance_loss_clip": 1.020895, "balance_loss_mlp": 1.03195763, "epoch": 0.10870284082368856, "flos": 24713967847680.0, "grad_norm": 1.8282252096160712, "language_loss": 0.7888785, "learning_rate": 3.884725297420053e-06, "loss": 0.81052244, "num_input_tokens_seen": 39049875, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.890625, "step": 1808, "time_per_iteration": 2.4256677627563477 }, { "auxiliary_loss_clip": 0.01125638, "auxiliary_loss_mlp": 0.01046616, "balance_loss_clip": 1.02194023, "balance_loss_mlp": 1.03399539, "epoch": 0.10876296407635654, "flos": 20626720824960.0, "grad_norm": 1.7326907662304973, "language_loss": 0.79196876, "learning_rate": 3.884598846992422e-06, "loss": 0.81369132, "num_input_tokens_seen": 39068935, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.9140625, "step": 1809, "time_per_iteration": 2.39125394821167 }, { "auxiliary_loss_clip": 0.01121593, "auxiliary_loss_mlp": 0.01045871, "balance_loss_clip": 1.0217669, "balance_loss_mlp": 1.03222513, "epoch": 0.1088230873290245, "flos": 21578767223040.0, "grad_norm": 1.9900982360126007, "language_loss": 0.84929574, "learning_rate": 3.884472329308306e-06, "loss": 0.87097037, "num_input_tokens_seen": 39087370, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.89453125, "step": 1810, "time_per_iteration": 2.3882687091827393 }, { "auxiliary_loss_clip": 0.01127982, "auxiliary_loss_mlp": 0.01052776, "balance_loss_clip": 1.02653766, "balance_loss_mlp": 1.03371501, "epoch": 0.10888321058169247, "flos": 26467829596800.0, "grad_norm": 2.0813929656714656, "language_loss": 0.63652569, "learning_rate": 3.8843457443722195e-06, "loss": 0.6583333, "num_input_tokens_seen": 39106635, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.94140625, "step": 1811, "time_per_iteration": 2.4178736209869385 }, { "auxiliary_loss_clip": 0.01122244, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.02147138, "balance_loss_mlp": 1.03190422, "epoch": 0.10894333383436045, "flos": 25847423003520.0, "grad_norm": 2.200120322991929, "language_loss": 0.74163443, "learning_rate": 3.884219092188681e-06, "loss": 0.76331341, "num_input_tokens_seen": 39126335, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.90625, "step": 1812, "time_per_iteration": 2.4389612674713135 }, { "auxiliary_loss_clip": 0.01122401, "auxiliary_loss_mlp": 0.0104553, "balance_loss_clip": 1.02191472, "balance_loss_mlp": 1.03145349, "epoch": 0.10900345708702841, "flos": 19536068862720.0, "grad_norm": 1.67541571038205, "language_loss": 0.72409236, "learning_rate": 3.884092372762209e-06, "loss": 0.74577165, "num_input_tokens_seen": 39144820, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.91015625, "step": 1813, "time_per_iteration": 2.4204447269439697 }, { "auxiliary_loss_clip": 0.01120348, "auxiliary_loss_mlp": 0.01047142, "balance_loss_clip": 1.02408659, "balance_loss_mlp": 1.03367043, "epoch": 0.10906358033969638, "flos": 23622163810560.0, "grad_norm": 1.8415215996899577, "language_loss": 0.82487369, "learning_rate": 3.883965586097327e-06, "loss": 0.84654868, "num_input_tokens_seen": 39165945, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8671875, "step": 1814, "time_per_iteration": 2.419451951980591 }, { "auxiliary_loss_clip": 0.01123526, "auxiliary_loss_mlp": 0.0104773, "balance_loss_clip": 1.02416265, "balance_loss_mlp": 1.03223526, "epoch": 0.10912370359236434, "flos": 21213680469120.0, "grad_norm": 4.420405023215869, "language_loss": 0.84061807, "learning_rate": 3.88383873219856e-06, "loss": 0.86233068, "num_input_tokens_seen": 39183520, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.9140625, "step": 1815, "time_per_iteration": 2.373497247695923 }, { "auxiliary_loss_clip": 0.0112164, "auxiliary_loss_mlp": 0.01044655, "balance_loss_clip": 1.01982379, "balance_loss_mlp": 1.03311396, "epoch": 0.10918382684503232, "flos": 13552339720320.0, "grad_norm": 4.895839077195346, "language_loss": 0.71816218, "learning_rate": 3.8837118110704345e-06, "loss": 0.73982519, "num_input_tokens_seen": 39201190, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.88671875, "step": 1816, "time_per_iteration": 2.3697516918182373 }, { "auxiliary_loss_clip": 0.01126296, "auxiliary_loss_mlp": 0.01053606, "balance_loss_clip": 1.02730894, "balance_loss_mlp": 1.03459978, "epoch": 0.10924395009770028, "flos": 27963089838720.0, "grad_norm": 2.297323944015786, "language_loss": 0.72977591, "learning_rate": 3.88358482271748e-06, "loss": 0.75157493, "num_input_tokens_seen": 39221210, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.91796875, "step": 1817, "time_per_iteration": 2.427844524383545 }, { "auxiliary_loss_clip": 0.01123589, "auxiliary_loss_mlp": 0.01046912, "balance_loss_clip": 1.02059031, "balance_loss_mlp": 1.0316056, "epoch": 0.10930407335036825, "flos": 25592557011840.0, "grad_norm": 1.665588613083356, "language_loss": 0.67563391, "learning_rate": 3.883457767144228e-06, "loss": 0.69733888, "num_input_tokens_seen": 39242025, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.921875, "step": 1818, "time_per_iteration": 2.426520586013794 }, { "auxiliary_loss_clip": 0.01123798, "auxiliary_loss_mlp": 0.01051526, "balance_loss_clip": 1.02620637, "balance_loss_mlp": 1.03213441, "epoch": 0.10936419660303623, "flos": 18405197147520.0, "grad_norm": 2.4183236947991564, "language_loss": 0.73805034, "learning_rate": 3.883330644355212e-06, "loss": 0.75980365, "num_input_tokens_seen": 39259870, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.9140625, "step": 1819, "time_per_iteration": 2.3530497550964355 }, { "auxiliary_loss_clip": 0.01124492, "auxiliary_loss_mlp": 0.01050427, "balance_loss_clip": 1.02703798, "balance_loss_mlp": 1.03328323, "epoch": 0.1094243198557042, "flos": 23838974553600.0, "grad_norm": 3.5626405489282345, "language_loss": 0.7400474, "learning_rate": 3.8832034543549716e-06, "loss": 0.76179659, "num_input_tokens_seen": 39278500, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.91015625, "step": 1820, "time_per_iteration": 2.417438507080078 }, { "auxiliary_loss_clip": 0.01122324, "auxiliary_loss_mlp": 0.01050176, "balance_loss_clip": 1.02416396, "balance_loss_mlp": 1.03208113, "epoch": 0.10948444310837216, "flos": 14643166239360.0, "grad_norm": 2.4897420477094308, "language_loss": 0.82555467, "learning_rate": 3.883076197148043e-06, "loss": 0.84727973, "num_input_tokens_seen": 39294800, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.90234375, "step": 1821, "time_per_iteration": 2.348799467086792 }, { "auxiliary_loss_clip": 0.01119461, "auxiliary_loss_mlp": 0.01048744, "balance_loss_clip": 1.02570057, "balance_loss_mlp": 1.03041005, "epoch": 0.10954456636104014, "flos": 27817571825280.0, "grad_norm": 2.5166504603033286, "language_loss": 0.76038003, "learning_rate": 3.8829488727389684e-06, "loss": 0.78206205, "num_input_tokens_seen": 39314625, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.890625, "step": 1822, "time_per_iteration": 2.4672350883483887 }, { "auxiliary_loss_clip": 0.01120139, "auxiliary_loss_mlp": 0.01039119, "balance_loss_clip": 1.01685095, "balance_loss_mlp": 1.03165674, "epoch": 0.1096046896137081, "flos": 33619508184960.0, "grad_norm": 1.7645762476545976, "language_loss": 0.79805642, "learning_rate": 3.882821481132294e-06, "loss": 0.81964904, "num_input_tokens_seen": 39336465, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8828125, "step": 1823, "time_per_iteration": 2.4774742126464844 }, { "auxiliary_loss_clip": 0.01121685, "auxiliary_loss_mlp": 0.01041736, "balance_loss_clip": 1.01870477, "balance_loss_mlp": 1.03323531, "epoch": 0.10966481286637607, "flos": 26978783477760.0, "grad_norm": 1.5088373435187543, "language_loss": 0.79352868, "learning_rate": 3.882694022332562e-06, "loss": 0.81516284, "num_input_tokens_seen": 39357930, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8828125, "step": 1824, "time_per_iteration": 2.421736717224121 }, { "auxiliary_loss_clip": 0.01121125, "auxiliary_loss_mlp": 0.01048417, "balance_loss_clip": 1.02428925, "balance_loss_mlp": 1.03207612, "epoch": 0.10972493611904403, "flos": 23035518368640.0, "grad_norm": 1.8666654077198064, "language_loss": 0.8807869, "learning_rate": 3.882566496344324e-06, "loss": 0.90248227, "num_input_tokens_seen": 39376380, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.88671875, "step": 1825, "time_per_iteration": 2.3920836448669434 }, { "auxiliary_loss_clip": 0.01116614, "auxiliary_loss_mlp": 0.01045545, "balance_loss_clip": 1.02299047, "balance_loss_mlp": 1.03023314, "epoch": 0.10978505937171201, "flos": 38103194229120.0, "grad_norm": 2.47998846056717, "language_loss": 0.76288664, "learning_rate": 3.88243890317213e-06, "loss": 0.78450823, "num_input_tokens_seen": 39399935, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.8671875, "step": 1826, "time_per_iteration": 2.5153608322143555 }, { "auxiliary_loss_clip": 0.01041664, "auxiliary_loss_mlp": 0.01008415, "balance_loss_clip": 1.00543463, "balance_loss_mlp": 1.01317477, "epoch": 0.10984518262437998, "flos": 59846645802240.0, "grad_norm": 0.8523878617208052, "language_loss": 0.54961729, "learning_rate": 3.882311242820534e-06, "loss": 0.57011807, "num_input_tokens_seen": 39460685, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.28515625, "step": 1827, "time_per_iteration": 3.0348868370056152 }, { "auxiliary_loss_clip": 0.01038473, "auxiliary_loss_mlp": 0.01004865, "balance_loss_clip": 1.00183761, "balance_loss_mlp": 1.00998783, "epoch": 0.10990530587704794, "flos": 66716295494400.0, "grad_norm": 0.7314426430528725, "language_loss": 0.553303, "learning_rate": 3.882183515294092e-06, "loss": 0.57373631, "num_input_tokens_seen": 39524765, "router_z_loss_clip": 0.03027344, "router_z_loss_mlp": 0.28515625, "step": 1828, "time_per_iteration": 3.0552117824554443 }, { "auxiliary_loss_clip": 0.01125022, "auxiliary_loss_mlp": 0.01047309, "balance_loss_clip": 1.02119052, "balance_loss_mlp": 1.03243375, "epoch": 0.10996542912971592, "flos": 25446026568960.0, "grad_norm": 3.020088416281762, "language_loss": 0.84540451, "learning_rate": 3.882055720597362e-06, "loss": 0.86712778, "num_input_tokens_seen": 39543640, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.92578125, "step": 1829, "time_per_iteration": 2.4032886028289795 }, { "auxiliary_loss_clip": 0.01123522, "auxiliary_loss_mlp": 0.01051236, "balance_loss_clip": 1.02602315, "balance_loss_mlp": 1.03332651, "epoch": 0.11002555238238389, "flos": 44016503425920.0, "grad_norm": 2.1794525035170795, "language_loss": 0.88641047, "learning_rate": 3.8819278587349045e-06, "loss": 0.90815806, "num_input_tokens_seen": 39567525, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.90234375, "step": 1830, "time_per_iteration": 2.5679574012756348 }, { "auxiliary_loss_clip": 0.01034133, "auxiliary_loss_mlp": 0.01003377, "balance_loss_clip": 0.99980056, "balance_loss_mlp": 1.00614595, "epoch": 0.11008567563505185, "flos": 54061781097600.0, "grad_norm": 0.6857460363847238, "language_loss": 0.55485028, "learning_rate": 3.881799929711282e-06, "loss": 0.57522535, "num_input_tokens_seen": 39628470, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.27929688, "step": 1831, "time_per_iteration": 3.04540753364563 }, { "auxiliary_loss_clip": 0.01129426, "auxiliary_loss_mlp": 0.01050821, "balance_loss_clip": 1.02491689, "balance_loss_mlp": 1.0353601, "epoch": 0.11014579888771983, "flos": 24242011822080.0, "grad_norm": 2.222061058726195, "language_loss": 0.91241372, "learning_rate": 3.881671933531061e-06, "loss": 0.9342162, "num_input_tokens_seen": 39646670, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.94140625, "step": 1832, "time_per_iteration": 3.7834312915802 }, { "auxiliary_loss_clip": 0.01035121, "auxiliary_loss_mlp": 0.01002943, "balance_loss_clip": 0.99962914, "balance_loss_mlp": 1.00814128, "epoch": 0.1102059221403878, "flos": 57740684325120.0, "grad_norm": 0.7083011127659482, "language_loss": 0.59934974, "learning_rate": 3.881543870198809e-06, "loss": 0.61973035, "num_input_tokens_seen": 39712915, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.26953125, "step": 1833, "time_per_iteration": 4.4285361766815186 }, { "auxiliary_loss_clip": 0.01122688, "auxiliary_loss_mlp": 0.01042234, "balance_loss_clip": 1.0180105, "balance_loss_mlp": 1.03290153, "epoch": 0.11026604539305576, "flos": 16795107843840.0, "grad_norm": 6.5647103964142275, "language_loss": 0.80468589, "learning_rate": 3.881415739719096e-06, "loss": 0.82633519, "num_input_tokens_seen": 39730650, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.8984375, "step": 1834, "time_per_iteration": 2.365412712097168 }, { "auxiliary_loss_clip": 0.01127512, "auxiliary_loss_mlp": 0.01049115, "balance_loss_clip": 1.0236876, "balance_loss_mlp": 1.0368191, "epoch": 0.11032616864572373, "flos": 23986936362240.0, "grad_norm": 3.0585102867786986, "language_loss": 0.90389204, "learning_rate": 3.881287542096494e-06, "loss": 0.92565829, "num_input_tokens_seen": 39751065, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.90625, "step": 1835, "time_per_iteration": 2.432892084121704 }, { "auxiliary_loss_clip": 0.01126549, "auxiliary_loss_mlp": 0.01044755, "balance_loss_clip": 1.02037621, "balance_loss_mlp": 1.03564322, "epoch": 0.1103862918983917, "flos": 19682110546560.0, "grad_norm": 2.1670359810007205, "language_loss": 0.63784945, "learning_rate": 3.881159277335581e-06, "loss": 0.65956241, "num_input_tokens_seen": 39769245, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.90625, "step": 1836, "time_per_iteration": 5.125218868255615 }, { "auxiliary_loss_clip": 0.01123524, "auxiliary_loss_mlp": 0.0104534, "balance_loss_clip": 1.02220166, "balance_loss_mlp": 1.03421807, "epoch": 0.11044641515105967, "flos": 32159510282880.0, "grad_norm": 1.921355031365176, "language_loss": 0.72566742, "learning_rate": 3.88103094544093e-06, "loss": 0.74735606, "num_input_tokens_seen": 39790830, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.890625, "step": 1837, "time_per_iteration": 2.4840970039367676 }, { "auxiliary_loss_clip": 0.01127698, "auxiliary_loss_mlp": 0.01051693, "balance_loss_clip": 1.02830374, "balance_loss_mlp": 1.03508973, "epoch": 0.11050653840372764, "flos": 16688343306240.0, "grad_norm": 2.6567733589711198, "language_loss": 0.7852577, "learning_rate": 3.880902546417125e-06, "loss": 0.80705154, "num_input_tokens_seen": 39809475, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.92578125, "step": 1838, "time_per_iteration": 2.3714590072631836 }, { "auxiliary_loss_clip": 0.01126018, "auxiliary_loss_mlp": 0.01053658, "balance_loss_clip": 1.02975667, "balance_loss_mlp": 1.03530455, "epoch": 0.11056666165639562, "flos": 21207989917440.0, "grad_norm": 1.8341679071863268, "language_loss": 0.71916747, "learning_rate": 3.880774080268745e-06, "loss": 0.74096417, "num_input_tokens_seen": 39826355, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.90625, "step": 1839, "time_per_iteration": 2.373873472213745 }, { "auxiliary_loss_clip": 0.01129867, "auxiliary_loss_mlp": 0.0104687, "balance_loss_clip": 1.02095413, "balance_loss_mlp": 1.0373522, "epoch": 0.11062678490906358, "flos": 19164663152640.0, "grad_norm": 2.2181662106134747, "language_loss": 0.7848084, "learning_rate": 3.880645547000377e-06, "loss": 0.80657578, "num_input_tokens_seen": 39845335, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.92578125, "step": 1840, "time_per_iteration": 2.397270679473877 }, { "auxiliary_loss_clip": 0.0111785, "auxiliary_loss_mlp": 0.0104291, "balance_loss_clip": 1.02027237, "balance_loss_mlp": 1.03210425, "epoch": 0.11068690816173155, "flos": 24894259441920.0, "grad_norm": 1.6079820018701225, "language_loss": 0.87717295, "learning_rate": 3.880516946616606e-06, "loss": 0.89878058, "num_input_tokens_seen": 39865065, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.859375, "step": 1841, "time_per_iteration": 2.405466079711914 }, { "auxiliary_loss_clip": 0.01119553, "auxiliary_loss_mlp": 0.01044184, "balance_loss_clip": 1.02077103, "balance_loss_mlp": 1.03343081, "epoch": 0.11074703141439952, "flos": 16471427829120.0, "grad_norm": 1.962118133830366, "language_loss": 0.90375423, "learning_rate": 3.880388279122023e-06, "loss": 0.92539161, "num_input_tokens_seen": 39882780, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.859375, "step": 1842, "time_per_iteration": 2.362666606903076 }, { "auxiliary_loss_clip": 0.0112011, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.02385056, "balance_loss_mlp": 1.03192472, "epoch": 0.11080715466706749, "flos": 19171401045120.0, "grad_norm": 2.339226252612978, "language_loss": 0.85794604, "learning_rate": 3.880259544521219e-06, "loss": 0.87961382, "num_input_tokens_seen": 39900295, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8828125, "step": 1843, "time_per_iteration": 2.3702476024627686 }, { "auxiliary_loss_clip": 0.01122537, "auxiliary_loss_mlp": 0.01047296, "balance_loss_clip": 1.02326322, "balance_loss_mlp": 1.03436565, "epoch": 0.11086727791973545, "flos": 27703580636160.0, "grad_norm": 1.9907211936404086, "language_loss": 0.74612248, "learning_rate": 3.880130742818789e-06, "loss": 0.76782072, "num_input_tokens_seen": 39922075, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8828125, "step": 1844, "time_per_iteration": 2.4470207691192627 }, { "auxiliary_loss_clip": 0.01125056, "auxiliary_loss_mlp": 0.01043043, "balance_loss_clip": 1.01850975, "balance_loss_mlp": 1.03429544, "epoch": 0.11092740117240343, "flos": 18513986544000.0, "grad_norm": 2.2664377638723683, "language_loss": 0.75605702, "learning_rate": 3.880001874019328e-06, "loss": 0.77773809, "num_input_tokens_seen": 39940115, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.90625, "step": 1845, "time_per_iteration": 2.352602243423462 }, { "auxiliary_loss_clip": 0.01121297, "auxiliary_loss_mlp": 0.01043874, "balance_loss_clip": 1.0212239, "balance_loss_mlp": 1.03393865, "epoch": 0.1109875244250714, "flos": 20521387653120.0, "grad_norm": 1.6060166262770896, "language_loss": 0.76185834, "learning_rate": 3.879872938127438e-06, "loss": 0.78351009, "num_input_tokens_seen": 39959920, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.875, "step": 1846, "time_per_iteration": 2.3857526779174805 }, { "auxiliary_loss_clip": 0.0112376, "auxiliary_loss_mlp": 0.01043863, "balance_loss_clip": 1.02034295, "balance_loss_mlp": 1.03274524, "epoch": 0.11104764767773936, "flos": 14097787891200.0, "grad_norm": 2.8021533328888744, "language_loss": 0.85970891, "learning_rate": 3.879743935147717e-06, "loss": 0.88138521, "num_input_tokens_seen": 39974755, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.91015625, "step": 1847, "time_per_iteration": 2.3417770862579346 }, { "auxiliary_loss_clip": 0.01122761, "auxiliary_loss_mlp": 0.01049376, "balance_loss_clip": 1.02574825, "balance_loss_mlp": 1.03167677, "epoch": 0.11110777093040733, "flos": 20593483344000.0, "grad_norm": 2.008085509007359, "language_loss": 0.77417588, "learning_rate": 3.87961486508477e-06, "loss": 0.79589725, "num_input_tokens_seen": 39993355, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.91015625, "step": 1848, "time_per_iteration": 2.385127067565918 }, { "auxiliary_loss_clip": 0.01121406, "auxiliary_loss_mlp": 0.01042318, "balance_loss_clip": 1.02090836, "balance_loss_mlp": 1.03614879, "epoch": 0.11116789418307531, "flos": 21869035200000.0, "grad_norm": 2.3025183471435877, "language_loss": 0.77871823, "learning_rate": 3.879485727943204e-06, "loss": 0.80035543, "num_input_tokens_seen": 40012410, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.8515625, "step": 1849, "time_per_iteration": 2.3708243370056152 }, { "auxiliary_loss_clip": 0.01122495, "auxiliary_loss_mlp": 0.01046036, "balance_loss_clip": 1.02386284, "balance_loss_mlp": 1.03078926, "epoch": 0.11122801743574327, "flos": 15522209251200.0, "grad_norm": 3.133443799544841, "language_loss": 0.712363, "learning_rate": 3.879356523727627e-06, "loss": 0.73404837, "num_input_tokens_seen": 40029315, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.91796875, "step": 1850, "time_per_iteration": 2.3596272468566895 }, { "auxiliary_loss_clip": 0.01124561, "auxiliary_loss_mlp": 0.01046046, "balance_loss_clip": 1.02158463, "balance_loss_mlp": 1.03467679, "epoch": 0.11128814068841124, "flos": 14391407358720.0, "grad_norm": 2.085739423799335, "language_loss": 0.81019771, "learning_rate": 3.87922725244265e-06, "loss": 0.8319037, "num_input_tokens_seen": 40045765, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.8984375, "step": 1851, "time_per_iteration": 2.337360382080078 }, { "auxiliary_loss_clip": 0.01121203, "auxiliary_loss_mlp": 0.01045591, "balance_loss_clip": 1.0227859, "balance_loss_mlp": 1.03309608, "epoch": 0.11134826394107922, "flos": 16653011143680.0, "grad_norm": 2.4333394722702217, "language_loss": 0.88124275, "learning_rate": 3.879097914092886e-06, "loss": 0.90291065, "num_input_tokens_seen": 40061660, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8828125, "step": 1852, "time_per_iteration": 2.357152223587036 }, { "auxiliary_loss_clip": 0.01122967, "auxiliary_loss_mlp": 0.01046162, "balance_loss_clip": 1.02071118, "balance_loss_mlp": 1.03313994, "epoch": 0.11140838719374718, "flos": 16690053962880.0, "grad_norm": 2.310298758811435, "language_loss": 0.72288007, "learning_rate": 3.878968508682952e-06, "loss": 0.74457133, "num_input_tokens_seen": 40080180, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.8984375, "step": 1853, "time_per_iteration": 2.3621346950531006 }, { "auxiliary_loss_clip": 0.0103504, "auxiliary_loss_mlp": 0.01007875, "balance_loss_clip": 1.00453675, "balance_loss_mlp": 1.00779462, "epoch": 0.11146851044641515, "flos": 60973397976960.0, "grad_norm": 0.7850775594182413, "language_loss": 0.53635836, "learning_rate": 3.878839036217464e-06, "loss": 0.55678749, "num_input_tokens_seen": 40138910, "router_z_loss_clip": 0.03344727, "router_z_loss_mlp": 0.2734375, "step": 1854, "time_per_iteration": 2.982922315597534 }, { "auxiliary_loss_clip": 0.01130098, "auxiliary_loss_mlp": 0.01046836, "balance_loss_clip": 1.02103877, "balance_loss_mlp": 1.03421712, "epoch": 0.11152863369908313, "flos": 22192924682880.0, "grad_norm": 2.521424876635544, "language_loss": 0.84896588, "learning_rate": 3.878709496701045e-06, "loss": 0.87073517, "num_input_tokens_seen": 40157745, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.95703125, "step": 1855, "time_per_iteration": 2.3863637447357178 }, { "auxiliary_loss_clip": 0.01120315, "auxiliary_loss_mlp": 0.0104773, "balance_loss_clip": 1.02462673, "balance_loss_mlp": 1.03160822, "epoch": 0.11158875695175109, "flos": 19536487799040.0, "grad_norm": 2.214380250338141, "language_loss": 0.81937933, "learning_rate": 3.8785798901383155e-06, "loss": 0.84105986, "num_input_tokens_seen": 40175375, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.88671875, "step": 1856, "time_per_iteration": 2.386831521987915 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01044205, "balance_loss_clip": 1.02087581, "balance_loss_mlp": 1.03325868, "epoch": 0.11164888020441906, "flos": 25441662648960.0, "grad_norm": 2.387040860028858, "language_loss": 0.83130205, "learning_rate": 3.878450216533902e-06, "loss": 0.85294759, "num_input_tokens_seen": 40195715, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.87109375, "step": 1857, "time_per_iteration": 2.4100728034973145 }, { "auxiliary_loss_clip": 0.01122763, "auxiliary_loss_mlp": 0.01041734, "balance_loss_clip": 1.01819038, "balance_loss_mlp": 1.0308063, "epoch": 0.11170900345708702, "flos": 15631836520320.0, "grad_norm": 2.108834383109374, "language_loss": 0.82937002, "learning_rate": 3.878320475892433e-06, "loss": 0.85101503, "num_input_tokens_seen": 40213975, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.921875, "step": 1858, "time_per_iteration": 2.378499984741211 }, { "auxiliary_loss_clip": 0.0112419, "auxiliary_loss_mlp": 0.01053308, "balance_loss_clip": 1.02928758, "balance_loss_mlp": 1.03413117, "epoch": 0.111769126709755, "flos": 23038311277440.0, "grad_norm": 2.439230848213365, "language_loss": 0.91331965, "learning_rate": 3.878190668218537e-06, "loss": 0.93509459, "num_input_tokens_seen": 40233905, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8984375, "step": 1859, "time_per_iteration": 2.3891844749450684 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.01046617, "balance_loss_clip": 1.02209592, "balance_loss_mlp": 1.03094959, "epoch": 0.11182924996242297, "flos": 20849641056000.0, "grad_norm": 2.1701555389504694, "language_loss": 0.81527424, "learning_rate": 3.878060793516847e-06, "loss": 0.83695626, "num_input_tokens_seen": 40252810, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.90625, "step": 1860, "time_per_iteration": 2.385209798812866 }, { "auxiliary_loss_clip": 0.01117795, "auxiliary_loss_mlp": 0.01048985, "balance_loss_clip": 1.02597761, "balance_loss_mlp": 1.03135908, "epoch": 0.11188937321509093, "flos": 17454407558400.0, "grad_norm": 4.339887688424837, "language_loss": 0.74721307, "learning_rate": 3.8779308517919995e-06, "loss": 0.76888084, "num_input_tokens_seen": 40272000, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8671875, "step": 1861, "time_per_iteration": 2.3749191761016846 }, { "auxiliary_loss_clip": 0.01121004, "auxiliary_loss_mlp": 0.01039688, "balance_loss_clip": 1.01765776, "balance_loss_mlp": 1.03207588, "epoch": 0.11194949646775891, "flos": 24094818063360.0, "grad_norm": 1.8191287058658938, "language_loss": 0.88780928, "learning_rate": 3.87780084304863e-06, "loss": 0.9094162, "num_input_tokens_seen": 40290660, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.890625, "step": 1862, "time_per_iteration": 2.390875816345215 }, { "auxiliary_loss_clip": 0.01122035, "auxiliary_loss_mlp": 0.01049775, "balance_loss_clip": 1.02638614, "balance_loss_mlp": 1.03333008, "epoch": 0.11200961972042688, "flos": 25152756215040.0, "grad_norm": 2.4417652901407396, "language_loss": 0.86999977, "learning_rate": 3.877670767291379e-06, "loss": 0.89171791, "num_input_tokens_seen": 40307820, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.88671875, "step": 1863, "time_per_iteration": 2.3952221870422363 }, { "auxiliary_loss_clip": 0.01123151, "auxiliary_loss_mlp": 0.01043867, "balance_loss_clip": 1.01883328, "balance_loss_mlp": 1.03344309, "epoch": 0.11206974297309484, "flos": 21287242437120.0, "grad_norm": 1.7716005915019037, "language_loss": 0.63989413, "learning_rate": 3.877540624524888e-06, "loss": 0.66156435, "num_input_tokens_seen": 40327430, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.8984375, "step": 1864, "time_per_iteration": 2.387942314147949 }, { "auxiliary_loss_clip": 0.01121606, "auxiliary_loss_mlp": 0.01043734, "balance_loss_clip": 1.02109623, "balance_loss_mlp": 1.0336833, "epoch": 0.11212986622576282, "flos": 18914998953600.0, "grad_norm": 2.8383906767286464, "language_loss": 0.74338508, "learning_rate": 3.877410414753802e-06, "loss": 0.76503849, "num_input_tokens_seen": 40344545, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.87890625, "step": 1865, "time_per_iteration": 2.3607590198516846 }, { "auxiliary_loss_clip": 0.0112189, "auxiliary_loss_mlp": 0.01046953, "balance_loss_clip": 1.02070308, "balance_loss_mlp": 1.03154969, "epoch": 0.11218998947843078, "flos": 22053655802880.0, "grad_norm": 9.87993134600205, "language_loss": 0.84361953, "learning_rate": 3.877280137982767e-06, "loss": 0.86530793, "num_input_tokens_seen": 40362300, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.90234375, "step": 1866, "time_per_iteration": 2.3785462379455566 }, { "auxiliary_loss_clip": 0.01120683, "auxiliary_loss_mlp": 0.01045652, "balance_loss_clip": 1.02070165, "balance_loss_mlp": 1.03133845, "epoch": 0.11225011273109875, "flos": 24570544515840.0, "grad_norm": 1.7207529171668403, "language_loss": 0.81263578, "learning_rate": 3.877149794216433e-06, "loss": 0.83429909, "num_input_tokens_seen": 40384720, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.89453125, "step": 1867, "time_per_iteration": 2.4535868167877197 }, { "auxiliary_loss_clip": 0.01124865, "auxiliary_loss_mlp": 0.01051785, "balance_loss_clip": 1.02877796, "balance_loss_mlp": 1.03491139, "epoch": 0.11231023598376672, "flos": 28437419836800.0, "grad_norm": 2.0254021408977803, "language_loss": 0.86644781, "learning_rate": 3.877019383459451e-06, "loss": 0.88821429, "num_input_tokens_seen": 40404000, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.8984375, "step": 1868, "time_per_iteration": 2.432866334915161 }, { "auxiliary_loss_clip": 0.01122161, "auxiliary_loss_mlp": 0.01043699, "balance_loss_clip": 1.02059674, "balance_loss_mlp": 1.03400004, "epoch": 0.1123703592364347, "flos": 14425657269120.0, "grad_norm": 2.5856270805718995, "language_loss": 0.68023825, "learning_rate": 3.876888905716476e-06, "loss": 0.70189679, "num_input_tokens_seen": 40418665, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.87890625, "step": 1869, "time_per_iteration": 2.3502705097198486 }, { "auxiliary_loss_clip": 0.01125969, "auxiliary_loss_mlp": 0.01052933, "balance_loss_clip": 1.02717185, "balance_loss_mlp": 1.03294349, "epoch": 0.11243048248910266, "flos": 22235204206080.0, "grad_norm": 1.536104041632161, "language_loss": 0.77442759, "learning_rate": 3.876758360992165e-06, "loss": 0.79621661, "num_input_tokens_seen": 40437870, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.9296875, "step": 1870, "time_per_iteration": 2.3861958980560303 }, { "auxiliary_loss_clip": 0.01123982, "auxiliary_loss_mlp": 0.01045221, "balance_loss_clip": 1.02044964, "balance_loss_mlp": 1.03114092, "epoch": 0.11249060574177062, "flos": 18583289326080.0, "grad_norm": 2.2165975537900806, "language_loss": 0.7623505, "learning_rate": 3.8766277492911736e-06, "loss": 0.7840426, "num_input_tokens_seen": 40455570, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.92578125, "step": 1871, "time_per_iteration": 3.832282781600952 }, { "auxiliary_loss_clip": 0.0112357, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.01706851, "balance_loss_mlp": 1.03323805, "epoch": 0.1125507289944386, "flos": 22855471153920.0, "grad_norm": 1.9322766154803015, "language_loss": 0.81456953, "learning_rate": 3.876497070618166e-06, "loss": 0.83620954, "num_input_tokens_seen": 40473600, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.90234375, "step": 1872, "time_per_iteration": 2.3943119049072266 }, { "auxiliary_loss_clip": 0.01125733, "auxiliary_loss_mlp": 0.01052084, "balance_loss_clip": 1.02839732, "balance_loss_mlp": 1.03431916, "epoch": 0.11261085224710657, "flos": 19675547210880.0, "grad_norm": 2.3744857363701612, "language_loss": 0.82998043, "learning_rate": 3.876366324977806e-06, "loss": 0.8517586, "num_input_tokens_seen": 40490025, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.9140625, "step": 1873, "time_per_iteration": 3.711221933364868 }, { "auxiliary_loss_clip": 0.01125359, "auxiliary_loss_mlp": 0.01049416, "balance_loss_clip": 1.02316654, "balance_loss_mlp": 1.03108621, "epoch": 0.11267097549977453, "flos": 26062173976320.0, "grad_norm": 1.8842552987423473, "language_loss": 0.92325759, "learning_rate": 3.876235512374757e-06, "loss": 0.94500536, "num_input_tokens_seen": 40511580, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.94140625, "step": 1874, "time_per_iteration": 2.4347848892211914 }, { "auxiliary_loss_clip": 0.01119011, "auxiliary_loss_mlp": 0.01047171, "balance_loss_clip": 1.02326918, "balance_loss_mlp": 1.03145373, "epoch": 0.11273109875244251, "flos": 21067010380800.0, "grad_norm": 1.4604694796120459, "language_loss": 0.7536639, "learning_rate": 3.876104632813689e-06, "loss": 0.77532566, "num_input_tokens_seen": 40530155, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.875, "step": 1875, "time_per_iteration": 5.135155439376831 }, { "auxiliary_loss_clip": 0.01120029, "auxiliary_loss_mlp": 0.01045988, "balance_loss_clip": 1.02455413, "balance_loss_mlp": 1.03409672, "epoch": 0.11279122200511048, "flos": 27087782342400.0, "grad_norm": 2.0494182757181982, "language_loss": 0.71384954, "learning_rate": 3.875973686299272e-06, "loss": 0.73550969, "num_input_tokens_seen": 40549500, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.859375, "step": 1876, "time_per_iteration": 2.4182865619659424 }, { "auxiliary_loss_clip": 0.01121093, "auxiliary_loss_mlp": 0.01044036, "balance_loss_clip": 1.02175617, "balance_loss_mlp": 1.03341925, "epoch": 0.11285134525777844, "flos": 20187024762240.0, "grad_norm": 1.8334081916707283, "language_loss": 0.7652418, "learning_rate": 3.875842672836182e-06, "loss": 0.78689313, "num_input_tokens_seen": 40567475, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.875, "step": 1877, "time_per_iteration": 2.3859004974365234 }, { "auxiliary_loss_clip": 0.0112174, "auxiliary_loss_mlp": 0.01055278, "balance_loss_clip": 1.02965963, "balance_loss_mlp": 1.03242016, "epoch": 0.11291146851044641, "flos": 12457638040320.0, "grad_norm": 2.5233166777136145, "language_loss": 0.87412786, "learning_rate": 3.87571159242909e-06, "loss": 0.89589804, "num_input_tokens_seen": 40583280, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.890625, "step": 1878, "time_per_iteration": 2.3492658138275146 }, { "auxiliary_loss_clip": 0.01121535, "auxiliary_loss_mlp": 0.01042434, "balance_loss_clip": 1.01773417, "balance_loss_mlp": 1.03220189, "epoch": 0.11297159176311439, "flos": 23841173969280.0, "grad_norm": 2.1187437654021233, "language_loss": 0.80941617, "learning_rate": 3.875580445082677e-06, "loss": 0.83105588, "num_input_tokens_seen": 40603080, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.89453125, "step": 1879, "time_per_iteration": 2.4078209400177 }, { "auxiliary_loss_clip": 0.0112127, "auxiliary_loss_mlp": 0.0105136, "balance_loss_clip": 1.02637398, "balance_loss_mlp": 1.03222597, "epoch": 0.11303171501578235, "flos": 29929363499520.0, "grad_norm": 2.0738310531410757, "language_loss": 0.69966519, "learning_rate": 3.875449230801622e-06, "loss": 0.72139156, "num_input_tokens_seen": 40623255, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.890625, "step": 1880, "time_per_iteration": 2.4354565143585205 }, { "auxiliary_loss_clip": 0.01121064, "auxiliary_loss_mlp": 0.01045815, "balance_loss_clip": 1.0205431, "balance_loss_mlp": 1.03177834, "epoch": 0.11309183826845032, "flos": 16179623752320.0, "grad_norm": 1.6998560748807998, "language_loss": 0.71996421, "learning_rate": 3.875317949590609e-06, "loss": 0.74163306, "num_input_tokens_seen": 40641570, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.89453125, "step": 1881, "time_per_iteration": 2.3698432445526123 }, { "auxiliary_loss_clip": 0.01122401, "auxiliary_loss_mlp": 0.01048771, "balance_loss_clip": 1.02403498, "balance_loss_mlp": 1.03180218, "epoch": 0.1131519615211183, "flos": 12019897013760.0, "grad_norm": 2.16034106561837, "language_loss": 0.74119371, "learning_rate": 3.875186601454322e-06, "loss": 0.76290548, "num_input_tokens_seen": 40658775, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.90625, "step": 1882, "time_per_iteration": 2.3546245098114014 }, { "auxiliary_loss_clip": 0.01119348, "auxiliary_loss_mlp": 0.01047118, "balance_loss_clip": 1.0215472, "balance_loss_mlp": 1.03185344, "epoch": 0.11321208477378626, "flos": 26248924172160.0, "grad_norm": 2.046400534186846, "language_loss": 0.79340416, "learning_rate": 3.8750551863974484e-06, "loss": 0.81506884, "num_input_tokens_seen": 40679555, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.875, "step": 1883, "time_per_iteration": 2.4280452728271484 }, { "auxiliary_loss_clip": 0.01121617, "auxiliary_loss_mlp": 0.01044822, "balance_loss_clip": 1.01896548, "balance_loss_mlp": 1.03018355, "epoch": 0.11327220802645423, "flos": 13625517663360.0, "grad_norm": 2.5416306163806515, "language_loss": 0.77227646, "learning_rate": 3.874923704424679e-06, "loss": 0.79394084, "num_input_tokens_seen": 40697295, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.9140625, "step": 1884, "time_per_iteration": 2.3503458499908447 }, { "auxiliary_loss_clip": 0.01040015, "auxiliary_loss_mlp": 0.01006052, "balance_loss_clip": 1.00242758, "balance_loss_mlp": 1.01261711, "epoch": 0.1133323312791222, "flos": 57188672818560.0, "grad_norm": 0.7855083275624904, "language_loss": 0.55201423, "learning_rate": 3.8747921555407045e-06, "loss": 0.57247484, "num_input_tokens_seen": 40758095, "router_z_loss_clip": 0.03613281, "router_z_loss_mlp": 0.2734375, "step": 1885, "time_per_iteration": 2.9273834228515625 }, { "auxiliary_loss_clip": 0.01113865, "auxiliary_loss_mlp": 0.01042374, "balance_loss_clip": 1.01996267, "balance_loss_mlp": 1.03008294, "epoch": 0.11339245453179017, "flos": 24350591750400.0, "grad_norm": 1.9656073476571887, "language_loss": 0.90563154, "learning_rate": 3.874660539750222e-06, "loss": 0.92719388, "num_input_tokens_seen": 40777140, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.83984375, "step": 1886, "time_per_iteration": 2.406250476837158 }, { "auxiliary_loss_clip": 0.01123372, "auxiliary_loss_mlp": 0.01042115, "balance_loss_clip": 1.01962066, "balance_loss_mlp": 1.03481531, "epoch": 0.11345257778445814, "flos": 22669698476160.0, "grad_norm": 1.9569472103942396, "language_loss": 0.85377294, "learning_rate": 3.874528857057926e-06, "loss": 0.87542778, "num_input_tokens_seen": 40797505, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.88671875, "step": 1887, "time_per_iteration": 2.4290239810943604 }, { "auxiliary_loss_clip": 0.01123608, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.02505445, "balance_loss_mlp": 1.03379381, "epoch": 0.11351270103712612, "flos": 20987408747520.0, "grad_norm": 3.57150940087995, "language_loss": 0.75795519, "learning_rate": 3.874397107468516e-06, "loss": 0.77967644, "num_input_tokens_seen": 40812970, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8984375, "step": 1888, "time_per_iteration": 2.40659236907959 }, { "auxiliary_loss_clip": 0.01125696, "auxiliary_loss_mlp": 0.01049427, "balance_loss_clip": 1.02421427, "balance_loss_mlp": 1.03497028, "epoch": 0.11357282428979408, "flos": 37346241841920.0, "grad_norm": 2.520445784871137, "language_loss": 0.68051779, "learning_rate": 3.874265290986696e-06, "loss": 0.70226902, "num_input_tokens_seen": 40837745, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.90625, "step": 1889, "time_per_iteration": 2.5671088695526123 }, { "auxiliary_loss_clip": 0.01123114, "auxiliary_loss_mlp": 0.01041424, "balance_loss_clip": 1.01773691, "balance_loss_mlp": 1.03432798, "epoch": 0.11363294754246205, "flos": 21756091351680.0, "grad_norm": 2.475836393648607, "language_loss": 0.8416034, "learning_rate": 3.874133407617169e-06, "loss": 0.86324883, "num_input_tokens_seen": 40856490, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.88671875, "step": 1890, "time_per_iteration": 2.384964942932129 }, { "auxiliary_loss_clip": 0.01118093, "auxiliary_loss_mlp": 0.01043316, "balance_loss_clip": 1.02026057, "balance_loss_mlp": 1.03270805, "epoch": 0.11369307079513001, "flos": 22600535339520.0, "grad_norm": 2.160870064589821, "language_loss": 0.64799368, "learning_rate": 3.874001457364642e-06, "loss": 0.66960776, "num_input_tokens_seen": 40874070, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8515625, "step": 1891, "time_per_iteration": 2.3964099884033203 }, { "auxiliary_loss_clip": 0.01121113, "auxiliary_loss_mlp": 0.01038546, "balance_loss_clip": 1.01606369, "balance_loss_mlp": 1.03340304, "epoch": 0.11375319404779799, "flos": 21943190661120.0, "grad_norm": 2.5961754883451422, "language_loss": 0.8853538, "learning_rate": 3.873869440233822e-06, "loss": 0.90695035, "num_input_tokens_seen": 40892425, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.875, "step": 1892, "time_per_iteration": 2.4157590866088867 }, { "auxiliary_loss_clip": 0.01123236, "auxiliary_loss_mlp": 0.01056673, "balance_loss_clip": 1.03193736, "balance_loss_mlp": 1.03481007, "epoch": 0.11381331730046595, "flos": 26394267628800.0, "grad_norm": 2.38988995888122, "language_loss": 0.73289359, "learning_rate": 3.8737373562294225e-06, "loss": 0.75469267, "num_input_tokens_seen": 40912190, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.8828125, "step": 1893, "time_per_iteration": 2.435520887374878 }, { "auxiliary_loss_clip": 0.01119479, "auxiliary_loss_mlp": 0.01054012, "balance_loss_clip": 1.02993214, "balance_loss_mlp": 1.03295314, "epoch": 0.11387344055313392, "flos": 23803607479680.0, "grad_norm": 2.002772720280015, "language_loss": 0.7954644, "learning_rate": 3.873605205356157e-06, "loss": 0.81719935, "num_input_tokens_seen": 40928395, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.8671875, "step": 1894, "time_per_iteration": 2.38785982131958 }, { "auxiliary_loss_clip": 0.01122086, "auxiliary_loss_mlp": 0.01047838, "balance_loss_clip": 1.02388895, "balance_loss_mlp": 1.03158116, "epoch": 0.1139335638058019, "flos": 34521699294720.0, "grad_norm": 5.640230676070867, "language_loss": 0.80075616, "learning_rate": 3.873472987618742e-06, "loss": 0.82245541, "num_input_tokens_seen": 40946555, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.90625, "step": 1895, "time_per_iteration": 2.4901442527770996 }, { "auxiliary_loss_clip": 0.01037659, "auxiliary_loss_mlp": 0.01010939, "balance_loss_clip": 1.00774467, "balance_loss_mlp": 1.01096821, "epoch": 0.11399368705846986, "flos": 70584148333440.0, "grad_norm": 0.799269048333181, "language_loss": 0.63373232, "learning_rate": 3.873340703021894e-06, "loss": 0.65421826, "num_input_tokens_seen": 41004910, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.26757812, "step": 1896, "time_per_iteration": 3.1031527519226074 }, { "auxiliary_loss_clip": 0.01120183, "auxiliary_loss_mlp": 0.0105012, "balance_loss_clip": 1.02465725, "balance_loss_mlp": 1.03358936, "epoch": 0.11405381031113783, "flos": 21323203004160.0, "grad_norm": 1.8792429588436772, "language_loss": 0.84862256, "learning_rate": 3.873208351570335e-06, "loss": 0.87032557, "num_input_tokens_seen": 41026385, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.8671875, "step": 1897, "time_per_iteration": 2.421992540359497 }, { "auxiliary_loss_clip": 0.01120028, "auxiliary_loss_mlp": 0.01045303, "balance_loss_clip": 1.02270079, "balance_loss_mlp": 1.0324626, "epoch": 0.11411393356380581, "flos": 19718594784000.0, "grad_norm": 2.787889135189672, "language_loss": 0.79151994, "learning_rate": 3.873075933268788e-06, "loss": 0.81317323, "num_input_tokens_seen": 41045315, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.875, "step": 1898, "time_per_iteration": 2.4150617122650146 }, { "auxiliary_loss_clip": 0.0112114, "auxiliary_loss_mlp": 0.01050052, "balance_loss_clip": 1.02486324, "balance_loss_mlp": 1.0317378, "epoch": 0.11417405681647377, "flos": 17529470714880.0, "grad_norm": 2.0055423221469075, "language_loss": 0.73206705, "learning_rate": 3.87294344812198e-06, "loss": 0.75377893, "num_input_tokens_seen": 41063390, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.89453125, "step": 1899, "time_per_iteration": 2.3959221839904785 }, { "auxiliary_loss_clip": 0.01034788, "auxiliary_loss_mlp": 0.01003222, "balance_loss_clip": 0.99971706, "balance_loss_mlp": 1.00755334, "epoch": 0.11423418006914174, "flos": 59671416355200.0, "grad_norm": 0.9087447150687288, "language_loss": 0.63396221, "learning_rate": 3.8728108961346386e-06, "loss": 0.65434235, "num_input_tokens_seen": 41124180, "router_z_loss_clip": 0.03515625, "router_z_loss_mlp": 0.2734375, "step": 1900, "time_per_iteration": 2.9975342750549316 }, { "auxiliary_loss_clip": 0.01122627, "auxiliary_loss_mlp": 0.01046789, "balance_loss_clip": 1.02257764, "balance_loss_mlp": 1.03412795, "epoch": 0.1142943033218097, "flos": 22962096046080.0, "grad_norm": 1.662102926602138, "language_loss": 0.78009129, "learning_rate": 3.872678277311493e-06, "loss": 0.80178547, "num_input_tokens_seen": 41143485, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.88671875, "step": 1901, "time_per_iteration": 2.405158758163452 }, { "auxiliary_loss_clip": 0.01121845, "auxiliary_loss_mlp": 0.01041315, "balance_loss_clip": 1.0184269, "balance_loss_mlp": 1.0350672, "epoch": 0.11435442657447768, "flos": 18255385036800.0, "grad_norm": 2.0287733645949926, "language_loss": 0.83728218, "learning_rate": 3.872545591657276e-06, "loss": 0.85891378, "num_input_tokens_seen": 41161695, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8671875, "step": 1902, "time_per_iteration": 2.3576552867889404 }, { "auxiliary_loss_clip": 0.01117738, "auxiliary_loss_mlp": 0.0104449, "balance_loss_clip": 1.01995707, "balance_loss_mlp": 1.0303036, "epoch": 0.11441454982714565, "flos": 24060044482560.0, "grad_norm": 1.6977257217677675, "language_loss": 0.77722776, "learning_rate": 3.872412839176725e-06, "loss": 0.79885, "num_input_tokens_seen": 41181715, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.875, "step": 1903, "time_per_iteration": 2.4129691123962402 }, { "auxiliary_loss_clip": 0.0112092, "auxiliary_loss_mlp": 0.01038833, "balance_loss_clip": 1.01737499, "balance_loss_mlp": 1.03335369, "epoch": 0.11447467307981361, "flos": 25336538945280.0, "grad_norm": 2.289445239864963, "language_loss": 0.75533634, "learning_rate": 3.872280019874576e-06, "loss": 0.77693391, "num_input_tokens_seen": 41201770, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.875, "step": 1904, "time_per_iteration": 2.4143829345703125 }, { "auxiliary_loss_clip": 0.01118086, "auxiliary_loss_mlp": 0.01044327, "balance_loss_clip": 1.01951957, "balance_loss_mlp": 1.03191447, "epoch": 0.11453479633248159, "flos": 21724983463680.0, "grad_norm": 2.360565416980462, "language_loss": 0.91935968, "learning_rate": 3.872147133755568e-06, "loss": 0.94098371, "num_input_tokens_seen": 41220590, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.86328125, "step": 1905, "time_per_iteration": 2.3952243328094482 }, { "auxiliary_loss_clip": 0.01121045, "auxiliary_loss_mlp": 0.01047477, "balance_loss_clip": 1.02266955, "balance_loss_mlp": 1.03007984, "epoch": 0.11459491958514956, "flos": 12968871212160.0, "grad_norm": 2.7913693138781923, "language_loss": 0.77344108, "learning_rate": 3.872014180824446e-06, "loss": 0.79512632, "num_input_tokens_seen": 41237250, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.91015625, "step": 1906, "time_per_iteration": 2.3560922145843506 }, { "auxiliary_loss_clip": 0.01119823, "auxiliary_loss_mlp": 0.01049127, "balance_loss_clip": 1.02524948, "balance_loss_mlp": 1.03317046, "epoch": 0.11465504283781752, "flos": 22710162608640.0, "grad_norm": 11.269839278915923, "language_loss": 0.81792992, "learning_rate": 3.8718811610859526e-06, "loss": 0.83961946, "num_input_tokens_seen": 41256680, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.8671875, "step": 1907, "time_per_iteration": 2.3930797576904297 }, { "auxiliary_loss_clip": 0.01120222, "auxiliary_loss_mlp": 0.01054948, "balance_loss_clip": 1.03223836, "balance_loss_mlp": 1.03404033, "epoch": 0.1147151660904855, "flos": 23397428188800.0, "grad_norm": 2.608949679238145, "language_loss": 0.84991479, "learning_rate": 3.8717480745448356e-06, "loss": 0.87166649, "num_input_tokens_seen": 41270955, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.86328125, "step": 1908, "time_per_iteration": 2.362637996673584 }, { "auxiliary_loss_clip": 0.01034183, "auxiliary_loss_mlp": 0.01003795, "balance_loss_clip": 1.00045669, "balance_loss_mlp": 1.00746334, "epoch": 0.11477528934315347, "flos": 63009044242560.0, "grad_norm": 0.9166563959521401, "language_loss": 0.60988611, "learning_rate": 3.871614921205845e-06, "loss": 0.63026589, "num_input_tokens_seen": 41319180, "router_z_loss_clip": 0.03344727, "router_z_loss_mlp": 0.26757812, "step": 1909, "time_per_iteration": 2.77168607711792 }, { "auxiliary_loss_clip": 0.01121819, "auxiliary_loss_mlp": 0.01041726, "balance_loss_clip": 1.01943362, "balance_loss_mlp": 1.0347178, "epoch": 0.11483541259582143, "flos": 16324687918080.0, "grad_norm": 1.8870721212084607, "language_loss": 0.78994447, "learning_rate": 3.871481701073731e-06, "loss": 0.81157988, "num_input_tokens_seen": 41337480, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.87109375, "step": 1910, "time_per_iteration": 2.3642501831054688 }, { "auxiliary_loss_clip": 0.01123226, "auxiliary_loss_mlp": 0.01042623, "balance_loss_clip": 1.02014017, "balance_loss_mlp": 1.03540301, "epoch": 0.1148955358484894, "flos": 21579325804800.0, "grad_norm": 2.1832236962668934, "language_loss": 0.77382857, "learning_rate": 3.8713484141532505e-06, "loss": 0.79548711, "num_input_tokens_seen": 41354650, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.87890625, "step": 1911, "time_per_iteration": 3.7580933570861816 }, { "auxiliary_loss_clip": 0.01116111, "auxiliary_loss_mlp": 0.01042138, "balance_loss_clip": 1.01986945, "balance_loss_mlp": 1.03272152, "epoch": 0.11495565910115738, "flos": 27672437836800.0, "grad_norm": 1.8401151809725036, "language_loss": 0.79115731, "learning_rate": 3.871215060449158e-06, "loss": 0.81273973, "num_input_tokens_seen": 41376935, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8359375, "step": 1912, "time_per_iteration": 2.4457361698150635 }, { "auxiliary_loss_clip": 0.01116913, "auxiliary_loss_mlp": 0.01054898, "balance_loss_clip": 1.03103209, "balance_loss_mlp": 1.03193891, "epoch": 0.11501578235382534, "flos": 20631294213120.0, "grad_norm": 1.8881752293686607, "language_loss": 0.77768546, "learning_rate": 3.871081639966213e-06, "loss": 0.79940355, "num_input_tokens_seen": 41396105, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.8515625, "step": 1913, "time_per_iteration": 3.730783224105835 }, { "auxiliary_loss_clip": 0.01120003, "auxiliary_loss_mlp": 0.01040292, "balance_loss_clip": 1.01674795, "balance_loss_mlp": 1.03115487, "epoch": 0.1150759056064933, "flos": 19828012584960.0, "grad_norm": 2.030156090053584, "language_loss": 0.7035594, "learning_rate": 3.870948152709178e-06, "loss": 0.72516233, "num_input_tokens_seen": 41415600, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.890625, "step": 1914, "time_per_iteration": 2.431093215942383 }, { "auxiliary_loss_clip": 0.0103309, "auxiliary_loss_mlp": 0.01008316, "balance_loss_clip": 1.00535917, "balance_loss_mlp": 1.0072974, "epoch": 0.11513602885916129, "flos": 70041981830400.0, "grad_norm": 0.7608967370047242, "language_loss": 0.61050045, "learning_rate": 3.870814598682816e-06, "loss": 0.63091445, "num_input_tokens_seen": 41478760, "router_z_loss_clip": 0.02954102, "router_z_loss_mlp": 0.2578125, "step": 1915, "time_per_iteration": 5.959998846054077 }, { "auxiliary_loss_clip": 0.01121487, "auxiliary_loss_mlp": 0.01043383, "balance_loss_clip": 1.01954126, "balance_loss_mlp": 1.03480065, "epoch": 0.11519615211182925, "flos": 15740835384960.0, "grad_norm": 6.53600990937075, "language_loss": 0.92811406, "learning_rate": 3.8706809778918935e-06, "loss": 0.94976276, "num_input_tokens_seen": 41495720, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.8671875, "step": 1916, "time_per_iteration": 2.364123821258545 }, { "auxiliary_loss_clip": 0.01118829, "auxiliary_loss_mlp": 0.01047289, "balance_loss_clip": 1.02338719, "balance_loss_mlp": 1.0321542, "epoch": 0.11525627536449722, "flos": 20666591464320.0, "grad_norm": 1.9052463671782878, "language_loss": 0.72640043, "learning_rate": 3.870547290341179e-06, "loss": 0.74806166, "num_input_tokens_seen": 41513585, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.8671875, "step": 1917, "time_per_iteration": 2.4193167686462402 }, { "auxiliary_loss_clip": 0.01118869, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.01358509, "balance_loss_mlp": 1.03429604, "epoch": 0.1153163986171652, "flos": 20302237848960.0, "grad_norm": 2.388180552467478, "language_loss": 0.74289095, "learning_rate": 3.870413536035442e-06, "loss": 0.76445532, "num_input_tokens_seen": 41533390, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.84375, "step": 1918, "time_per_iteration": 2.3907649517059326 }, { "auxiliary_loss_clip": 0.01121428, "auxiliary_loss_mlp": 0.01037743, "balance_loss_clip": 1.01305485, "balance_loss_mlp": 1.03278852, "epoch": 0.11537652186983316, "flos": 17638364845440.0, "grad_norm": 2.183824722391978, "language_loss": 0.86369371, "learning_rate": 3.870279714979458e-06, "loss": 0.88528538, "num_input_tokens_seen": 41551015, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.88671875, "step": 1919, "time_per_iteration": 2.3742287158966064 }, { "auxiliary_loss_clip": 0.01117779, "auxiliary_loss_mlp": 0.01045815, "balance_loss_clip": 1.02177036, "balance_loss_mlp": 1.03159249, "epoch": 0.11543664512250112, "flos": 21068337012480.0, "grad_norm": 3.6738136039291676, "language_loss": 0.86615455, "learning_rate": 3.870145827178002e-06, "loss": 0.8877905, "num_input_tokens_seen": 41568055, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.859375, "step": 1920, "time_per_iteration": 2.3817317485809326 }, { "auxiliary_loss_clip": 0.01117626, "auxiliary_loss_mlp": 0.01040966, "balance_loss_clip": 1.01758945, "balance_loss_mlp": 1.03209615, "epoch": 0.11549676837516909, "flos": 22746437377920.0, "grad_norm": 2.0350494362644977, "language_loss": 0.79077518, "learning_rate": 3.8700118726358525e-06, "loss": 0.81236112, "num_input_tokens_seen": 41587435, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.85546875, "step": 1921, "time_per_iteration": 2.4029366970062256 }, { "auxiliary_loss_clip": 0.01123317, "auxiliary_loss_mlp": 0.01051315, "balance_loss_clip": 1.02592325, "balance_loss_mlp": 1.03322721, "epoch": 0.11555689162783707, "flos": 19168049554560.0, "grad_norm": 1.9432903044582477, "language_loss": 0.78655696, "learning_rate": 3.869877851357789e-06, "loss": 0.80830324, "num_input_tokens_seen": 41604975, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.90234375, "step": 1922, "time_per_iteration": 2.364842414855957 }, { "auxiliary_loss_clip": 0.01125324, "auxiliary_loss_mlp": 0.01050104, "balance_loss_clip": 1.02689362, "balance_loss_mlp": 1.03513312, "epoch": 0.11561701488050503, "flos": 24570893629440.0, "grad_norm": 2.1217780923341003, "language_loss": 0.8439163, "learning_rate": 3.869743763348595e-06, "loss": 0.86567056, "num_input_tokens_seen": 41626155, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.90234375, "step": 1923, "time_per_iteration": 2.4334733486175537 }, { "auxiliary_loss_clip": 0.01122675, "auxiliary_loss_mlp": 0.01044974, "balance_loss_clip": 1.02002394, "balance_loss_mlp": 1.03377521, "epoch": 0.115677138133173, "flos": 17091590042880.0, "grad_norm": 2.242921719131463, "language_loss": 0.80798101, "learning_rate": 3.869609608613055e-06, "loss": 0.82965755, "num_input_tokens_seen": 41644805, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.890625, "step": 1924, "time_per_iteration": 2.3767452239990234 }, { "auxiliary_loss_clip": 0.01033338, "auxiliary_loss_mlp": 0.01004751, "balance_loss_clip": 1.00162804, "balance_loss_mlp": 1.0078938, "epoch": 0.11573726138584098, "flos": 62700515758080.0, "grad_norm": 0.8289051333358758, "language_loss": 0.61181498, "learning_rate": 3.869475387155958e-06, "loss": 0.63219583, "num_input_tokens_seen": 41709345, "router_z_loss_clip": 0.03125, "router_z_loss_mlp": 0.25390625, "step": 1925, "time_per_iteration": 3.051574230194092 }, { "auxiliary_loss_clip": 0.01118964, "auxiliary_loss_mlp": 0.01048068, "balance_loss_clip": 1.02341545, "balance_loss_mlp": 1.03163743, "epoch": 0.11579738463850894, "flos": 22600046580480.0, "grad_norm": 1.8870413846579273, "language_loss": 0.75285721, "learning_rate": 3.8693410989820925e-06, "loss": 0.77452743, "num_input_tokens_seen": 41730210, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.875, "step": 1926, "time_per_iteration": 2.398479461669922 }, { "auxiliary_loss_clip": 0.01120091, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.01912022, "balance_loss_mlp": 1.03248537, "epoch": 0.11585750789117691, "flos": 21725053286400.0, "grad_norm": 4.134645563731568, "language_loss": 0.72157353, "learning_rate": 3.869206744096252e-06, "loss": 0.74322253, "num_input_tokens_seen": 41750270, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.875, "step": 1927, "time_per_iteration": 2.406965494155884 }, { "auxiliary_loss_clip": 0.01117803, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.01684427, "balance_loss_mlp": 1.03162479, "epoch": 0.11591763114384489, "flos": 26286316104960.0, "grad_norm": 1.534695631001158, "language_loss": 0.86650527, "learning_rate": 3.869072322503232e-06, "loss": 0.88809288, "num_input_tokens_seen": 41772975, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.859375, "step": 1928, "time_per_iteration": 2.432307243347168 }, { "auxiliary_loss_clip": 0.01120868, "auxiliary_loss_mlp": 0.01047544, "balance_loss_clip": 1.02378583, "balance_loss_mlp": 1.03301144, "epoch": 0.11597775439651285, "flos": 22999418156160.0, "grad_norm": 1.7562016611887232, "language_loss": 0.77448833, "learning_rate": 3.868937834207828e-06, "loss": 0.79617244, "num_input_tokens_seen": 41791765, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.87890625, "step": 1929, "time_per_iteration": 2.4027504920959473 }, { "auxiliary_loss_clip": 0.01117126, "auxiliary_loss_mlp": 0.01050525, "balance_loss_clip": 1.02811384, "balance_loss_mlp": 1.0315845, "epoch": 0.11603787764918082, "flos": 31940360478720.0, "grad_norm": 2.9383215708820356, "language_loss": 0.76913202, "learning_rate": 3.86880327921484e-06, "loss": 0.79080844, "num_input_tokens_seen": 41815615, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.85546875, "step": 1930, "time_per_iteration": 2.471935510635376 }, { "auxiliary_loss_clip": 0.01119107, "auxiliary_loss_mlp": 0.01045321, "balance_loss_clip": 1.02261209, "balance_loss_mlp": 1.03244591, "epoch": 0.1160980009018488, "flos": 22270606191360.0, "grad_norm": 1.950474949962868, "language_loss": 0.72070694, "learning_rate": 3.8686686575290695e-06, "loss": 0.74235123, "num_input_tokens_seen": 41834810, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.8671875, "step": 1931, "time_per_iteration": 2.3999249935150146 }, { "auxiliary_loss_clip": 0.01123334, "auxiliary_loss_mlp": 0.01045584, "balance_loss_clip": 1.0216229, "balance_loss_mlp": 1.03538382, "epoch": 0.11615812415451676, "flos": 22782537590400.0, "grad_norm": 1.6694959414934365, "language_loss": 0.82114506, "learning_rate": 3.868533969155322e-06, "loss": 0.84283423, "num_input_tokens_seen": 41854975, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8828125, "step": 1932, "time_per_iteration": 2.4132401943206787 }, { "auxiliary_loss_clip": 0.01029327, "auxiliary_loss_mlp": 0.01009468, "balance_loss_clip": 1.00627303, "balance_loss_mlp": 1.00398624, "epoch": 0.11621824740718473, "flos": 67142864885760.0, "grad_norm": 0.77813532920461, "language_loss": 0.61104012, "learning_rate": 3.868399214098404e-06, "loss": 0.631428, "num_input_tokens_seen": 41911105, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.25390625, "step": 1933, "time_per_iteration": 2.89139986038208 }, { "auxiliary_loss_clip": 0.01121111, "auxiliary_loss_mlp": 0.01044965, "balance_loss_clip": 1.02174306, "balance_loss_mlp": 1.03182209, "epoch": 0.11627837065985269, "flos": 20374892121600.0, "grad_norm": 5.832653992836649, "language_loss": 0.85950387, "learning_rate": 3.868264392363124e-06, "loss": 0.88116461, "num_input_tokens_seen": 41931750, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.89453125, "step": 1934, "time_per_iteration": 2.4212839603424072 }, { "auxiliary_loss_clip": 0.01126041, "auxiliary_loss_mlp": 0.01047781, "balance_loss_clip": 1.02274728, "balance_loss_mlp": 1.03563976, "epoch": 0.11633849391252067, "flos": 21724739084160.0, "grad_norm": 2.285134865303062, "language_loss": 0.65957439, "learning_rate": 3.868129503954293e-06, "loss": 0.68131256, "num_input_tokens_seen": 41949400, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.90625, "step": 1935, "time_per_iteration": 2.410168409347534 }, { "auxiliary_loss_clip": 0.01124111, "auxiliary_loss_mlp": 0.01048578, "balance_loss_clip": 1.02572513, "balance_loss_mlp": 1.03325069, "epoch": 0.11639861716518864, "flos": 18804394166400.0, "grad_norm": 2.468005328842679, "language_loss": 0.75913846, "learning_rate": 3.867994548876726e-06, "loss": 0.78086537, "num_input_tokens_seen": 41968100, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.90625, "step": 1936, "time_per_iteration": 2.3786065578460693 }, { "auxiliary_loss_clip": 0.01123324, "auxiliary_loss_mlp": 0.01044361, "balance_loss_clip": 1.01954186, "balance_loss_mlp": 1.03270447, "epoch": 0.1164587404178566, "flos": 21213924848640.0, "grad_norm": 2.03835241920668, "language_loss": 0.8434478, "learning_rate": 3.867859527135238e-06, "loss": 0.86512464, "num_input_tokens_seen": 41986375, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.90625, "step": 1937, "time_per_iteration": 2.4055237770080566 }, { "auxiliary_loss_clip": 0.011168, "auxiliary_loss_mlp": 0.01036059, "balance_loss_clip": 1.01487517, "balance_loss_mlp": 1.0320996, "epoch": 0.11651886367052458, "flos": 27817397268480.0, "grad_norm": 1.9669668728451497, "language_loss": 0.76133978, "learning_rate": 3.867724438734649e-06, "loss": 0.78286839, "num_input_tokens_seen": 42006055, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.84765625, "step": 1938, "time_per_iteration": 2.444209575653076 }, { "auxiliary_loss_clip": 0.0112379, "auxiliary_loss_mlp": 0.01045809, "balance_loss_clip": 1.02169299, "balance_loss_mlp": 1.03271842, "epoch": 0.11657898692319255, "flos": 22888568989440.0, "grad_norm": 2.4333915561606583, "language_loss": 0.79423189, "learning_rate": 3.867589283679779e-06, "loss": 0.81592792, "num_input_tokens_seen": 42024995, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.91015625, "step": 1939, "time_per_iteration": 2.3905863761901855 }, { "auxiliary_loss_clip": 0.01120133, "auxiliary_loss_mlp": 0.01053959, "balance_loss_clip": 1.02989054, "balance_loss_mlp": 1.03123116, "epoch": 0.11663911017586051, "flos": 24314770828800.0, "grad_norm": 2.2356411317656377, "language_loss": 0.8636415, "learning_rate": 3.867454061975451e-06, "loss": 0.88538247, "num_input_tokens_seen": 42042640, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.88671875, "step": 1940, "time_per_iteration": 2.4451136589050293 }, { "auxiliary_loss_clip": 0.01118435, "auxiliary_loss_mlp": 0.01055251, "balance_loss_clip": 1.03233874, "balance_loss_mlp": 1.03401971, "epoch": 0.11669923342852849, "flos": 42338507794560.0, "grad_norm": 1.3780814995012212, "language_loss": 0.75742328, "learning_rate": 3.8673187736264914e-06, "loss": 0.77916014, "num_input_tokens_seen": 42067005, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.84375, "step": 1941, "time_per_iteration": 2.576957941055298 }, { "auxiliary_loss_clip": 0.01117836, "auxiliary_loss_mlp": 0.01047996, "balance_loss_clip": 1.02486932, "balance_loss_mlp": 1.0315516, "epoch": 0.11675935668119646, "flos": 14641560316800.0, "grad_norm": 2.077467512953499, "language_loss": 0.88486266, "learning_rate": 3.8671834186377275e-06, "loss": 0.90652096, "num_input_tokens_seen": 42082295, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.86328125, "step": 1942, "time_per_iteration": 2.3597772121429443 }, { "auxiliary_loss_clip": 0.01115714, "auxiliary_loss_mlp": 0.01043805, "balance_loss_clip": 1.02228785, "balance_loss_mlp": 1.03200805, "epoch": 0.11681947993386442, "flos": 35115012806400.0, "grad_norm": 1.6107241451107719, "language_loss": 0.68025339, "learning_rate": 3.867047997013991e-06, "loss": 0.70184863, "num_input_tokens_seen": 42105295, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.8359375, "step": 1943, "time_per_iteration": 2.5141799449920654 }, { "auxiliary_loss_clip": 0.01115098, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.01480138, "balance_loss_mlp": 1.03127599, "epoch": 0.11687960318653239, "flos": 38981713570560.0, "grad_norm": 3.075908187618805, "language_loss": 0.69172108, "learning_rate": 3.866912508760114e-06, "loss": 0.71324313, "num_input_tokens_seen": 42125520, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.8359375, "step": 1944, "time_per_iteration": 2.5394175052642822 }, { "auxiliary_loss_clip": 0.01116969, "auxiliary_loss_mlp": 0.01040783, "balance_loss_clip": 1.01936138, "balance_loss_mlp": 1.03080392, "epoch": 0.11693972643920036, "flos": 25993778889600.0, "grad_norm": 1.4261852213290416, "language_loss": 0.82534927, "learning_rate": 3.866776953880932e-06, "loss": 0.84692681, "num_input_tokens_seen": 42146335, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.86328125, "step": 1945, "time_per_iteration": 2.4165971279144287 }, { "auxiliary_loss_clip": 0.01115177, "auxiliary_loss_mlp": 0.01047136, "balance_loss_clip": 1.02493882, "balance_loss_mlp": 1.0304879, "epoch": 0.11699984969186833, "flos": 27270866845440.0, "grad_norm": 2.2188461994747657, "language_loss": 0.764691, "learning_rate": 3.8666413323812825e-06, "loss": 0.78631407, "num_input_tokens_seen": 42165320, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.84375, "step": 1946, "time_per_iteration": 2.4419751167297363 }, { "auxiliary_loss_clip": 0.01116491, "auxiliary_loss_mlp": 0.01048205, "balance_loss_clip": 1.02660465, "balance_loss_mlp": 1.03320909, "epoch": 0.1170599729445363, "flos": 15266959234560.0, "grad_norm": 1.852763228158811, "language_loss": 0.68523192, "learning_rate": 3.8665056442660055e-06, "loss": 0.7068789, "num_input_tokens_seen": 42182955, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.8359375, "step": 1947, "time_per_iteration": 2.3548645973205566 }, { "auxiliary_loss_clip": 0.01123669, "auxiliary_loss_mlp": 0.01047806, "balance_loss_clip": 1.02370167, "balance_loss_mlp": 1.03648901, "epoch": 0.11712009619720427, "flos": 17163511176960.0, "grad_norm": 2.2191004921610955, "language_loss": 0.84888136, "learning_rate": 3.866369889539942e-06, "loss": 0.87059611, "num_input_tokens_seen": 42200760, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.87109375, "step": 1948, "time_per_iteration": 2.3758552074432373 }, { "auxiliary_loss_clip": 0.01033309, "auxiliary_loss_mlp": 0.0101984, "balance_loss_clip": 1.01684785, "balance_loss_mlp": 1.0073961, "epoch": 0.11718021944987224, "flos": 70937644515840.0, "grad_norm": 0.8216537331260262, "language_loss": 0.65126908, "learning_rate": 3.86623406820794e-06, "loss": 0.67180055, "num_input_tokens_seen": 42265745, "router_z_loss_clip": 0.02990723, "router_z_loss_mlp": 0.25976562, "step": 1949, "time_per_iteration": 3.048769235610962 }, { "auxiliary_loss_clip": 0.01115862, "auxiliary_loss_mlp": 0.01048048, "balance_loss_clip": 1.02580369, "balance_loss_mlp": 1.03134167, "epoch": 0.1172403427025402, "flos": 27452240691840.0, "grad_norm": 1.6808494109986374, "language_loss": 0.71865463, "learning_rate": 3.8660981802748434e-06, "loss": 0.74029374, "num_input_tokens_seen": 42286245, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.84375, "step": 1950, "time_per_iteration": 2.432399034500122 }, { "auxiliary_loss_clip": 0.01123141, "auxiliary_loss_mlp": 0.01046762, "balance_loss_clip": 1.02400458, "balance_loss_mlp": 1.03402305, "epoch": 0.11730046595520818, "flos": 15667831998720.0, "grad_norm": 2.7210292700723393, "language_loss": 0.76711386, "learning_rate": 3.865962225745504e-06, "loss": 0.78881288, "num_input_tokens_seen": 42302710, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.890625, "step": 1951, "time_per_iteration": 3.768670082092285 }, { "auxiliary_loss_clip": 0.01120962, "auxiliary_loss_mlp": 0.0104838, "balance_loss_clip": 1.02534842, "balance_loss_mlp": 1.03474116, "epoch": 0.11736058920787615, "flos": 25628971426560.0, "grad_norm": 1.7319526892347994, "language_loss": 0.7685138, "learning_rate": 3.865826204624771e-06, "loss": 0.79020721, "num_input_tokens_seen": 42324115, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.86328125, "step": 1952, "time_per_iteration": 2.421802520751953 }, { "auxiliary_loss_clip": 0.01118202, "auxiliary_loss_mlp": 0.01047002, "balance_loss_clip": 1.02407813, "balance_loss_mlp": 1.03074563, "epoch": 0.11742071246054411, "flos": 21433214298240.0, "grad_norm": 1.7038080722742601, "language_loss": 0.71910661, "learning_rate": 3.865690116917501e-06, "loss": 0.74075866, "num_input_tokens_seen": 42342505, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.875, "step": 1953, "time_per_iteration": 3.782447338104248 }, { "auxiliary_loss_clip": 0.01124642, "auxiliary_loss_mlp": 0.0104304, "balance_loss_clip": 1.01981831, "balance_loss_mlp": 1.03440595, "epoch": 0.11748083571321208, "flos": 15996923274240.0, "grad_norm": 2.5555025588281386, "language_loss": 0.79637015, "learning_rate": 3.8655539626285505e-06, "loss": 0.81804705, "num_input_tokens_seen": 42360525, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.90234375, "step": 1954, "time_per_iteration": 3.778489351272583 }, { "auxiliary_loss_clip": 0.01119053, "auxiliary_loss_mlp": 0.01050862, "balance_loss_clip": 1.02672267, "balance_loss_mlp": 1.03155947, "epoch": 0.11754095896588006, "flos": 16179134993280.0, "grad_norm": 1.9247334416091033, "language_loss": 0.85399234, "learning_rate": 3.865417741762777e-06, "loss": 0.87569153, "num_input_tokens_seen": 42377045, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.875, "step": 1955, "time_per_iteration": 3.747711420059204 }, { "auxiliary_loss_clip": 0.01121077, "auxiliary_loss_mlp": 0.01046602, "balance_loss_clip": 1.02414286, "balance_loss_mlp": 1.03478241, "epoch": 0.11760108221854802, "flos": 13260745111680.0, "grad_norm": 2.2783008108075076, "language_loss": 0.77872068, "learning_rate": 3.865281454325043e-06, "loss": 0.80039746, "num_input_tokens_seen": 42393960, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.86328125, "step": 1956, "time_per_iteration": 2.368340492248535 }, { "auxiliary_loss_clip": 0.01116919, "auxiliary_loss_mlp": 0.01044495, "balance_loss_clip": 1.02103484, "balance_loss_mlp": 1.03262877, "epoch": 0.11766120547121599, "flos": 24497296750080.0, "grad_norm": 1.8959509243281567, "language_loss": 0.80642533, "learning_rate": 3.865145100320212e-06, "loss": 0.82803947, "num_input_tokens_seen": 42413160, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.84375, "step": 1957, "time_per_iteration": 2.4359662532806396 }, { "auxiliary_loss_clip": 0.01122664, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.01593804, "balance_loss_mlp": 1.03471184, "epoch": 0.11772132872388397, "flos": 17783079897600.0, "grad_norm": 3.407091579068367, "language_loss": 0.77597332, "learning_rate": 3.86500867975315e-06, "loss": 0.79759777, "num_input_tokens_seen": 42432590, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.87890625, "step": 1958, "time_per_iteration": 2.3869376182556152 }, { "auxiliary_loss_clip": 0.01118996, "auxiliary_loss_mlp": 0.01040341, "balance_loss_clip": 1.01607013, "balance_loss_mlp": 1.03330898, "epoch": 0.11778145197655193, "flos": 13216405818240.0, "grad_norm": 2.2072269949487886, "language_loss": 0.7668767, "learning_rate": 3.864872192628725e-06, "loss": 0.78847003, "num_input_tokens_seen": 42450135, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.85546875, "step": 1959, "time_per_iteration": 2.3841238021850586 }, { "auxiliary_loss_clip": 0.01122492, "auxiliary_loss_mlp": 0.01039689, "balance_loss_clip": 1.01795745, "balance_loss_mlp": 1.03416228, "epoch": 0.1178415752292199, "flos": 20229164640000.0, "grad_norm": 1.892203115995961, "language_loss": 0.69768929, "learning_rate": 3.864735638951809e-06, "loss": 0.71931112, "num_input_tokens_seen": 42470050, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.8828125, "step": 1960, "time_per_iteration": 2.4021859169006348 }, { "auxiliary_loss_clip": 0.01123859, "auxiliary_loss_mlp": 0.01043981, "balance_loss_clip": 1.02006721, "balance_loss_mlp": 1.03432, "epoch": 0.11790169848188788, "flos": 13039360980480.0, "grad_norm": 2.654020946733496, "language_loss": 0.81240052, "learning_rate": 3.864599018727275e-06, "loss": 0.83407891, "num_input_tokens_seen": 42484335, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.8984375, "step": 1961, "time_per_iteration": 2.367949962615967 }, { "auxiliary_loss_clip": 0.0111437, "auxiliary_loss_mlp": 0.01047871, "balance_loss_clip": 1.02488744, "balance_loss_mlp": 1.03202939, "epoch": 0.11796182173455584, "flos": 22264845816960.0, "grad_norm": 2.19753021704276, "language_loss": 0.92440534, "learning_rate": 3.864462331959998e-06, "loss": 0.94602782, "num_input_tokens_seen": 42502720, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.82421875, "step": 1962, "time_per_iteration": 2.4064676761627197 }, { "auxiliary_loss_clip": 0.01122337, "auxiliary_loss_mlp": 0.01049053, "balance_loss_clip": 1.02627194, "balance_loss_mlp": 1.03362584, "epoch": 0.1180219449872238, "flos": 10634229129600.0, "grad_norm": 2.1702189567270125, "language_loss": 0.871997, "learning_rate": 3.864325578654856e-06, "loss": 0.89371091, "num_input_tokens_seen": 42519460, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.88671875, "step": 1963, "time_per_iteration": 2.4086060523986816 }, { "auxiliary_loss_clip": 0.01118009, "auxiliary_loss_mlp": 0.01043094, "balance_loss_clip": 1.02043271, "balance_loss_mlp": 1.03014529, "epoch": 0.11808206823989177, "flos": 20922469885440.0, "grad_norm": 2.0896722423300678, "language_loss": 0.83948267, "learning_rate": 3.864188758816731e-06, "loss": 0.8610937, "num_input_tokens_seen": 42539420, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.87890625, "step": 1964, "time_per_iteration": 2.3962087631225586 }, { "auxiliary_loss_clip": 0.01124048, "auxiliary_loss_mlp": 0.01046546, "balance_loss_clip": 1.02208459, "balance_loss_mlp": 1.03625286, "epoch": 0.11814219149255975, "flos": 20776707492480.0, "grad_norm": 2.1223552877057097, "language_loss": 0.82847214, "learning_rate": 3.864051872450504e-06, "loss": 0.85017812, "num_input_tokens_seen": 42558225, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.875, "step": 1965, "time_per_iteration": 2.375262975692749 }, { "auxiliary_loss_clip": 0.01120428, "auxiliary_loss_mlp": 0.01043547, "balance_loss_clip": 1.01982474, "balance_loss_mlp": 1.03300381, "epoch": 0.11820231474522772, "flos": 48758162572800.0, "grad_norm": 1.6467709144383305, "language_loss": 0.74588215, "learning_rate": 3.863914919561059e-06, "loss": 0.76752186, "num_input_tokens_seen": 42580790, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.875, "step": 1966, "time_per_iteration": 2.6221964359283447 }, { "auxiliary_loss_clip": 0.01129223, "auxiliary_loss_mlp": 0.01048573, "balance_loss_clip": 1.02477837, "balance_loss_mlp": 1.03779209, "epoch": 0.11826243799789568, "flos": 16689669937920.0, "grad_norm": 2.8886502735577246, "language_loss": 0.72988284, "learning_rate": 3.863777900153287e-06, "loss": 0.75166082, "num_input_tokens_seen": 42597355, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.9140625, "step": 1967, "time_per_iteration": 2.3466272354125977 }, { "auxiliary_loss_clip": 0.01121168, "auxiliary_loss_mlp": 0.01043189, "balance_loss_clip": 1.01883435, "balance_loss_mlp": 1.03336382, "epoch": 0.11832256125056366, "flos": 16908924476160.0, "grad_norm": 2.087425676331709, "language_loss": 0.88269222, "learning_rate": 3.863640814232076e-06, "loss": 0.90433586, "num_input_tokens_seen": 42616060, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.875, "step": 1968, "time_per_iteration": 2.3878138065338135 }, { "auxiliary_loss_clip": 0.01120177, "auxiliary_loss_mlp": 0.01045114, "balance_loss_clip": 1.02167749, "balance_loss_mlp": 1.03433907, "epoch": 0.11838268450323162, "flos": 22819301118720.0, "grad_norm": 2.373016368325097, "language_loss": 0.67450416, "learning_rate": 3.863503661802317e-06, "loss": 0.6961571, "num_input_tokens_seen": 42636285, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.859375, "step": 1969, "time_per_iteration": 2.3945834636688232 }, { "auxiliary_loss_clip": 0.01122843, "auxiliary_loss_mlp": 0.01044308, "balance_loss_clip": 1.02043056, "balance_loss_mlp": 1.0355742, "epoch": 0.11844280775589959, "flos": 33544479939840.0, "grad_norm": 2.5095524727065324, "language_loss": 0.80832243, "learning_rate": 3.863366442868906e-06, "loss": 0.82999396, "num_input_tokens_seen": 42658320, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.875, "step": 1970, "time_per_iteration": 2.4908838272094727 }, { "auxiliary_loss_clip": 0.01033128, "auxiliary_loss_mlp": 0.01007311, "balance_loss_clip": 1.00461662, "balance_loss_mlp": 1.0066843, "epoch": 0.11850293100856757, "flos": 66347577959040.0, "grad_norm": 0.8002912839407599, "language_loss": 0.66149813, "learning_rate": 3.863229157436741e-06, "loss": 0.68190253, "num_input_tokens_seen": 42721500, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.26367188, "step": 1971, "time_per_iteration": 3.002826452255249 }, { "auxiliary_loss_clip": 0.01120792, "auxiliary_loss_mlp": 0.01038308, "balance_loss_clip": 1.01590836, "balance_loss_mlp": 1.03313684, "epoch": 0.11856305426123553, "flos": 24679892494080.0, "grad_norm": 2.2289507829910598, "language_loss": 0.7991339, "learning_rate": 3.863091805510718e-06, "loss": 0.82072496, "num_input_tokens_seen": 42739825, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.875, "step": 1972, "time_per_iteration": 2.423044204711914 }, { "auxiliary_loss_clip": 0.01118167, "auxiliary_loss_mlp": 0.01045811, "balance_loss_clip": 1.02244556, "balance_loss_mlp": 1.03223205, "epoch": 0.1186231775139035, "flos": 24278949907200.0, "grad_norm": 2.1791190440773556, "language_loss": 0.72848439, "learning_rate": 3.862954387095743e-06, "loss": 0.75012422, "num_input_tokens_seen": 42758695, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.859375, "step": 1973, "time_per_iteration": 2.4122936725616455 }, { "auxiliary_loss_clip": 0.01117423, "auxiliary_loss_mlp": 0.01040538, "balance_loss_clip": 1.01789987, "balance_loss_mlp": 1.03277802, "epoch": 0.11868330076657148, "flos": 21756475376640.0, "grad_norm": 1.721549687915635, "language_loss": 0.71981263, "learning_rate": 3.862816902196717e-06, "loss": 0.74139225, "num_input_tokens_seen": 42778510, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.84375, "step": 1974, "time_per_iteration": 2.415842056274414 }, { "auxiliary_loss_clip": 0.01120913, "auxiliary_loss_mlp": 0.01044245, "balance_loss_clip": 1.02083254, "balance_loss_mlp": 1.03460026, "epoch": 0.11874342401923944, "flos": 17192559294720.0, "grad_norm": 2.1320444490964077, "language_loss": 0.78171802, "learning_rate": 3.862679350818547e-06, "loss": 0.80336952, "num_input_tokens_seen": 42793995, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.86328125, "step": 1975, "time_per_iteration": 2.3707268238067627 }, { "auxiliary_loss_clip": 0.01121493, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.02053881, "balance_loss_mlp": 1.03448355, "epoch": 0.11880354727190741, "flos": 15228729429120.0, "grad_norm": 5.821912162635384, "language_loss": 0.75312293, "learning_rate": 3.862541732966144e-06, "loss": 0.77475655, "num_input_tokens_seen": 42809000, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.8671875, "step": 1976, "time_per_iteration": 2.361881971359253 }, { "auxiliary_loss_clip": 0.01117401, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.02064562, "balance_loss_mlp": 1.03225017, "epoch": 0.11886367052457537, "flos": 27308433335040.0, "grad_norm": 3.8382577631191896, "language_loss": 0.75069487, "learning_rate": 3.862404048644416e-06, "loss": 0.77230787, "num_input_tokens_seen": 42831585, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8515625, "step": 1977, "time_per_iteration": 2.4624083042144775 }, { "auxiliary_loss_clip": 0.01122901, "auxiliary_loss_mlp": 0.010488, "balance_loss_clip": 1.02482712, "balance_loss_mlp": 1.03556919, "epoch": 0.11892379377724335, "flos": 21797218800000.0, "grad_norm": 2.156976507215717, "language_loss": 0.7394048, "learning_rate": 3.862266297858279e-06, "loss": 0.76112187, "num_input_tokens_seen": 42848420, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.87109375, "step": 1978, "time_per_iteration": 2.385887622833252 }, { "auxiliary_loss_clip": 0.011179, "auxiliary_loss_mlp": 0.01046777, "balance_loss_clip": 1.02485406, "balance_loss_mlp": 1.03311896, "epoch": 0.11898391702991132, "flos": 13990150569600.0, "grad_norm": 1.908963691018837, "language_loss": 0.73343402, "learning_rate": 3.862128480612648e-06, "loss": 0.75508082, "num_input_tokens_seen": 42866645, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.84765625, "step": 1979, "time_per_iteration": 2.3576278686523438 }, { "auxiliary_loss_clip": 0.01120054, "auxiliary_loss_mlp": 0.01044867, "balance_loss_clip": 1.02189517, "balance_loss_mlp": 1.03369176, "epoch": 0.11904404028257928, "flos": 32233176984960.0, "grad_norm": 1.6278011430777886, "language_loss": 0.9859215, "learning_rate": 3.8619905969124415e-06, "loss": 1.00757062, "num_input_tokens_seen": 42888515, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.86328125, "step": 1980, "time_per_iteration": 2.4863715171813965 }, { "auxiliary_loss_clip": 0.01123417, "auxiliary_loss_mlp": 0.01049478, "balance_loss_clip": 1.02595758, "balance_loss_mlp": 1.03414297, "epoch": 0.11910416353524726, "flos": 23585155902720.0, "grad_norm": 1.7044605200764433, "language_loss": 0.8611837, "learning_rate": 3.86185264676258e-06, "loss": 0.88291258, "num_input_tokens_seen": 42909035, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.890625, "step": 1981, "time_per_iteration": 2.4147064685821533 }, { "auxiliary_loss_clip": 0.01123441, "auxiliary_loss_mlp": 0.01047793, "balance_loss_clip": 1.02433228, "balance_loss_mlp": 1.03463411, "epoch": 0.11916428678791523, "flos": 25332000468480.0, "grad_norm": 1.8984009700727715, "language_loss": 0.85393345, "learning_rate": 3.861714630167987e-06, "loss": 0.87564576, "num_input_tokens_seen": 42927555, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.88671875, "step": 1982, "time_per_iteration": 2.4262924194335938 }, { "auxiliary_loss_clip": 0.01118164, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.01779306, "balance_loss_mlp": 1.03233421, "epoch": 0.11922441004058319, "flos": 19787513541120.0, "grad_norm": 2.4797697769246785, "language_loss": 0.85202748, "learning_rate": 3.8615765471335874e-06, "loss": 0.87362069, "num_input_tokens_seen": 42945300, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.859375, "step": 1983, "time_per_iteration": 2.377220392227173 }, { "auxiliary_loss_clip": 0.01124226, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.02544284, "balance_loss_mlp": 1.03365731, "epoch": 0.11928453329325117, "flos": 21535475270400.0, "grad_norm": 3.3113596146379733, "language_loss": 0.77033579, "learning_rate": 3.8614383976643096e-06, "loss": 0.79208553, "num_input_tokens_seen": 42961295, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.90625, "step": 1984, "time_per_iteration": 2.3807835578918457 }, { "auxiliary_loss_clip": 0.0111945, "auxiliary_loss_mlp": 0.01055403, "balance_loss_clip": 1.03197885, "balance_loss_mlp": 1.03279757, "epoch": 0.11934465654591914, "flos": 20813924868480.0, "grad_norm": 1.8278780443494753, "language_loss": 0.83421803, "learning_rate": 3.861300181765084e-06, "loss": 0.85596657, "num_input_tokens_seen": 42980330, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8671875, "step": 1985, "time_per_iteration": 2.380126714706421 }, { "auxiliary_loss_clip": 0.01116393, "auxiliary_loss_mlp": 0.01042754, "balance_loss_clip": 1.02002048, "balance_loss_mlp": 1.03147042, "epoch": 0.1194047797985871, "flos": 19059539448960.0, "grad_norm": 2.061105441215783, "language_loss": 0.73861659, "learning_rate": 3.861161899440843e-06, "loss": 0.76020807, "num_input_tokens_seen": 42996125, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.8515625, "step": 1986, "time_per_iteration": 2.3536202907562256 }, { "auxiliary_loss_clip": 0.01121507, "auxiliary_loss_mlp": 0.01049547, "balance_loss_clip": 1.02557421, "balance_loss_mlp": 1.03355265, "epoch": 0.11946490305125507, "flos": 27189798935040.0, "grad_norm": 1.9365010102679958, "language_loss": 0.720505, "learning_rate": 3.86102355069652e-06, "loss": 0.74221563, "num_input_tokens_seen": 43014180, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.87890625, "step": 1987, "time_per_iteration": 2.439728021621704 }, { "auxiliary_loss_clip": 0.0112178, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.02298498, "balance_loss_mlp": 1.03436017, "epoch": 0.11952502630392305, "flos": 21139769387520.0, "grad_norm": 2.47030658028113, "language_loss": 0.71941423, "learning_rate": 3.860885135537054e-06, "loss": 0.74110174, "num_input_tokens_seen": 43032120, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.875, "step": 1988, "time_per_iteration": 2.406344413757324 }, { "auxiliary_loss_clip": 0.01119401, "auxiliary_loss_mlp": 0.01054096, "balance_loss_clip": 1.02750063, "balance_loss_mlp": 1.03266072, "epoch": 0.11958514955659101, "flos": 22123237875840.0, "grad_norm": 1.9140912637348366, "language_loss": 0.80716503, "learning_rate": 3.860746653967384e-06, "loss": 0.82889998, "num_input_tokens_seen": 43052215, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 0.8671875, "step": 1989, "time_per_iteration": 2.383209705352783 }, { "auxiliary_loss_clip": 0.01124657, "auxiliary_loss_mlp": 0.01047651, "balance_loss_clip": 1.02223611, "balance_loss_mlp": 1.03449106, "epoch": 0.11964527280925898, "flos": 17420471850240.0, "grad_norm": 2.7618199204925213, "language_loss": 0.75409639, "learning_rate": 3.860608105992454e-06, "loss": 0.77581948, "num_input_tokens_seen": 43069720, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.90234375, "step": 1990, "time_per_iteration": 3.7754063606262207 }, { "auxiliary_loss_clip": 0.01033002, "auxiliary_loss_mlp": 0.01003007, "balance_loss_clip": 0.99988371, "balance_loss_mlp": 1.00634062, "epoch": 0.11970539606192696, "flos": 70676564302080.0, "grad_norm": 0.846775636511847, "language_loss": 0.55253577, "learning_rate": 3.860469491617206e-06, "loss": 0.57289588, "num_input_tokens_seen": 43123130, "router_z_loss_clip": 0.03125, "router_z_loss_mlp": 0.265625, "step": 1991, "time_per_iteration": 3.015073537826538 }, { "auxiliary_loss_clip": 0.01118721, "auxiliary_loss_mlp": 0.01043284, "balance_loss_clip": 1.02001476, "balance_loss_mlp": 1.03399217, "epoch": 0.11976551931459492, "flos": 21213959760000.0, "grad_norm": 6.539764829323169, "language_loss": 0.78014505, "learning_rate": 3.8603308108465864e-06, "loss": 0.80176508, "num_input_tokens_seen": 43140015, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.84765625, "step": 1992, "time_per_iteration": 3.727733612060547 }, { "auxiliary_loss_clip": 0.01123085, "auxiliary_loss_mlp": 0.01045995, "balance_loss_clip": 1.02146149, "balance_loss_mlp": 1.03341937, "epoch": 0.11982564256726289, "flos": 25988262894720.0, "grad_norm": 1.732685606188902, "language_loss": 0.79110837, "learning_rate": 3.8601920636855466e-06, "loss": 0.8127991, "num_input_tokens_seen": 43160105, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.8984375, "step": 1993, "time_per_iteration": 3.8382067680358887 }, { "auxiliary_loss_clip": 0.01117257, "auxiliary_loss_mlp": 0.01047155, "balance_loss_clip": 1.02362347, "balance_loss_mlp": 1.03113675, "epoch": 0.11988576581993086, "flos": 21649850484480.0, "grad_norm": 1.8914864370530282, "language_loss": 0.82625687, "learning_rate": 3.860053250139036e-06, "loss": 0.84790105, "num_input_tokens_seen": 43179835, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.859375, "step": 1994, "time_per_iteration": 2.388251781463623 }, { "auxiliary_loss_clip": 0.01119523, "auxiliary_loss_mlp": 0.01044532, "balance_loss_clip": 1.02286029, "balance_loss_mlp": 1.03431726, "epoch": 0.11994588907259883, "flos": 17856467308800.0, "grad_norm": 2.0853703367348304, "language_loss": 0.88376117, "learning_rate": 3.859914370212011e-06, "loss": 0.90540171, "num_input_tokens_seen": 43197210, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.8515625, "step": 1995, "time_per_iteration": 3.7453725337982178 }, { "auxiliary_loss_clip": 0.01120729, "auxiliary_loss_mlp": 0.01053838, "balance_loss_clip": 1.02916145, "balance_loss_mlp": 1.03449523, "epoch": 0.1200060123252668, "flos": 24461580562560.0, "grad_norm": 1.9456351794724802, "language_loss": 0.7399205, "learning_rate": 3.859775423909426e-06, "loss": 0.76166618, "num_input_tokens_seen": 43215050, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.86328125, "step": 1996, "time_per_iteration": 2.4598515033721924 }, { "auxiliary_loss_clip": 0.01118924, "auxiliary_loss_mlp": 0.01044023, "balance_loss_clip": 1.01909614, "balance_loss_mlp": 1.03307962, "epoch": 0.12006613557793476, "flos": 18731251134720.0, "grad_norm": 2.0778181248459413, "language_loss": 0.87980461, "learning_rate": 3.8596364112362395e-06, "loss": 0.90143406, "num_input_tokens_seen": 43233900, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.859375, "step": 1997, "time_per_iteration": 2.352095127105713 }, { "auxiliary_loss_clip": 0.01118619, "auxiliary_loss_mlp": 0.01053211, "balance_loss_clip": 1.02774751, "balance_loss_mlp": 1.03132403, "epoch": 0.12012625883060274, "flos": 22266800853120.0, "grad_norm": 2.0035983354663993, "language_loss": 0.78402185, "learning_rate": 3.859497332197413e-06, "loss": 0.80574012, "num_input_tokens_seen": 43252105, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.875, "step": 1998, "time_per_iteration": 2.411452054977417 }, { "auxiliary_loss_clip": 0.01122502, "auxiliary_loss_mlp": 0.01045654, "balance_loss_clip": 1.02150226, "balance_loss_mlp": 1.03462815, "epoch": 0.1201863820832707, "flos": 21757906742400.0, "grad_norm": 1.6482861458018172, "language_loss": 0.73282808, "learning_rate": 3.8593581867979105e-06, "loss": 0.75450969, "num_input_tokens_seen": 43270315, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.87890625, "step": 1999, "time_per_iteration": 2.4018945693969727 }, { "auxiliary_loss_clip": 0.01121568, "auxiliary_loss_mlp": 0.01050488, "balance_loss_clip": 1.02662206, "balance_loss_mlp": 1.03275871, "epoch": 0.12024650533593867, "flos": 21906915891840.0, "grad_norm": 2.312608842513132, "language_loss": 0.74748641, "learning_rate": 3.8592189750426965e-06, "loss": 0.769207, "num_input_tokens_seen": 43289935, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.88671875, "step": 2000, "time_per_iteration": 2.414414167404175 }, { "auxiliary_loss_clip": 0.01120195, "auxiliary_loss_mlp": 0.01045413, "balance_loss_clip": 1.02091551, "balance_loss_mlp": 1.0318017, "epoch": 0.12030662858860665, "flos": 21688150112640.0, "grad_norm": 1.5722378616385124, "language_loss": 0.84657854, "learning_rate": 3.85907969693674e-06, "loss": 0.86823463, "num_input_tokens_seen": 43309325, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.8828125, "step": 2001, "time_per_iteration": 2.3905575275421143 }, { "auxiliary_loss_clip": 0.01118935, "auxiliary_loss_mlp": 0.01042232, "balance_loss_clip": 1.01928473, "balance_loss_mlp": 1.03210068, "epoch": 0.12036675184127461, "flos": 12932386974720.0, "grad_norm": 2.150666824424472, "language_loss": 0.74219608, "learning_rate": 3.858940352485011e-06, "loss": 0.76380777, "num_input_tokens_seen": 43327010, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.8671875, "step": 2002, "time_per_iteration": 2.3468594551086426 }, { "auxiliary_loss_clip": 0.01125384, "auxiliary_loss_mlp": 0.01048133, "balance_loss_clip": 1.02219284, "balance_loss_mlp": 1.03465629, "epoch": 0.12042687509394258, "flos": 20849955258240.0, "grad_norm": 2.2416977745271627, "language_loss": 0.77901542, "learning_rate": 3.8588009416924835e-06, "loss": 0.80075049, "num_input_tokens_seen": 43345650, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.90625, "step": 2003, "time_per_iteration": 2.3935604095458984 }, { "auxiliary_loss_clip": 0.01118354, "auxiliary_loss_mlp": 0.01045372, "balance_loss_clip": 1.02026618, "balance_loss_mlp": 1.03217614, "epoch": 0.12048699834661056, "flos": 23877378915840.0, "grad_norm": 2.3116049897435924, "language_loss": 0.72234046, "learning_rate": 3.858661464564131e-06, "loss": 0.74397773, "num_input_tokens_seen": 43365555, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.86328125, "step": 2004, "time_per_iteration": 2.4032437801361084 }, { "auxiliary_loss_clip": 0.01127669, "auxiliary_loss_mlp": 0.01051112, "balance_loss_clip": 1.02428949, "balance_loss_mlp": 1.03524661, "epoch": 0.12054712159927852, "flos": 19755323400960.0, "grad_norm": 1.6450597877325683, "language_loss": 0.78438574, "learning_rate": 3.858521921104932e-06, "loss": 0.80617362, "num_input_tokens_seen": 43384990, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 0.921875, "step": 2005, "time_per_iteration": 2.4171624183654785 }, { "auxiliary_loss_clip": 0.01030416, "auxiliary_loss_mlp": 0.01006522, "balance_loss_clip": 1.00354171, "balance_loss_mlp": 1.00357127, "epoch": 0.12060724485194649, "flos": 51670057075200.0, "grad_norm": 0.9250736729463825, "language_loss": 0.58070427, "learning_rate": 3.858382311319866e-06, "loss": 0.60107362, "num_input_tokens_seen": 43436335, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.26953125, "step": 2006, "time_per_iteration": 2.8138062953948975 }, { "auxiliary_loss_clip": 0.01120069, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.02029848, "balance_loss_mlp": 1.03448987, "epoch": 0.12066736810461445, "flos": 18989398794240.0, "grad_norm": 1.7877193335870534, "language_loss": 0.76783776, "learning_rate": 3.858242635213917e-06, "loss": 0.78947806, "num_input_tokens_seen": 43456495, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.85546875, "step": 2007, "time_per_iteration": 2.4160773754119873 }, { "auxiliary_loss_clip": 0.01121991, "auxiliary_loss_mlp": 0.0105572, "balance_loss_clip": 1.03088856, "balance_loss_mlp": 1.03364897, "epoch": 0.12072749135728243, "flos": 16471043804160.0, "grad_norm": 3.1240115634081933, "language_loss": 0.8271625, "learning_rate": 3.858102892792067e-06, "loss": 0.84893966, "num_input_tokens_seen": 43473085, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.8828125, "step": 2008, "time_per_iteration": 2.364374876022339 }, { "auxiliary_loss_clip": 0.01119849, "auxiliary_loss_mlp": 0.01046688, "balance_loss_clip": 1.02267885, "balance_loss_mlp": 1.03178072, "epoch": 0.1207876146099504, "flos": 18076140783360.0, "grad_norm": 2.175275464065516, "language_loss": 0.83321232, "learning_rate": 3.857963084059304e-06, "loss": 0.85487771, "num_input_tokens_seen": 43491135, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8828125, "step": 2009, "time_per_iteration": 2.3697962760925293 }, { "auxiliary_loss_clip": 0.01123511, "auxiliary_loss_mlp": 0.01055106, "balance_loss_clip": 1.02812898, "balance_loss_mlp": 1.03362918, "epoch": 0.12084773786261836, "flos": 21870501477120.0, "grad_norm": 1.731425056093623, "language_loss": 0.84200156, "learning_rate": 3.857823209020619e-06, "loss": 0.86378777, "num_input_tokens_seen": 43510440, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.8984375, "step": 2010, "time_per_iteration": 2.382655382156372 }, { "auxiliary_loss_clip": 0.01125417, "auxiliary_loss_mlp": 0.01058759, "balance_loss_clip": 1.03335524, "balance_loss_mlp": 1.03640819, "epoch": 0.12090786111528634, "flos": 18332054115840.0, "grad_norm": 1.7003581849257905, "language_loss": 0.84254408, "learning_rate": 3.857683267681002e-06, "loss": 0.86438584, "num_input_tokens_seen": 43530145, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.890625, "step": 2011, "time_per_iteration": 2.4131879806518555 }, { "auxiliary_loss_clip": 0.01124281, "auxiliary_loss_mlp": 0.01052195, "balance_loss_clip": 1.02688694, "balance_loss_mlp": 1.03430021, "epoch": 0.1209679843679543, "flos": 21104786338560.0, "grad_norm": 1.8372059440576718, "language_loss": 0.95579314, "learning_rate": 3.857543260045448e-06, "loss": 0.9775579, "num_input_tokens_seen": 43549315, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.8984375, "step": 2012, "time_per_iteration": 2.3958005905151367 }, { "auxiliary_loss_clip": 0.01119601, "auxiliary_loss_mlp": 0.0104187, "balance_loss_clip": 1.01671648, "balance_loss_mlp": 1.03354287, "epoch": 0.12102810762062227, "flos": 29239793769600.0, "grad_norm": 2.677766300536327, "language_loss": 0.80141032, "learning_rate": 3.857403186118952e-06, "loss": 0.82302499, "num_input_tokens_seen": 43569240, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.859375, "step": 2013, "time_per_iteration": 2.4480528831481934 }, { "auxiliary_loss_clip": 0.01123554, "auxiliary_loss_mlp": 0.01051356, "balance_loss_clip": 1.02354503, "balance_loss_mlp": 1.03343034, "epoch": 0.12108823087329025, "flos": 17929749985920.0, "grad_norm": 2.5308944401753597, "language_loss": 0.77227497, "learning_rate": 3.857263045906516e-06, "loss": 0.79402405, "num_input_tokens_seen": 43587710, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 0.8984375, "step": 2014, "time_per_iteration": 2.358426809310913 }, { "auxiliary_loss_clip": 0.01121764, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 1.01609063, "balance_loss_mlp": 1.03438997, "epoch": 0.12114835412595822, "flos": 22090733533440.0, "grad_norm": 3.5941496769831156, "language_loss": 0.86573106, "learning_rate": 3.857122839413138e-06, "loss": 0.88736767, "num_input_tokens_seen": 43606000, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.875, "step": 2015, "time_per_iteration": 2.4904682636260986 }, { "auxiliary_loss_clip": 0.01116226, "auxiliary_loss_mlp": 0.0105065, "balance_loss_clip": 1.02637863, "balance_loss_mlp": 1.03096962, "epoch": 0.12120847737862618, "flos": 20411306536320.0, "grad_norm": 2.4791579545711127, "language_loss": 0.68878114, "learning_rate": 3.856982566643824e-06, "loss": 0.71044993, "num_input_tokens_seen": 43624815, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.8515625, "step": 2016, "time_per_iteration": 2.3910837173461914 }, { "auxiliary_loss_clip": 0.01125592, "auxiliary_loss_mlp": 0.0104802, "balance_loss_clip": 1.02233076, "balance_loss_mlp": 1.03680539, "epoch": 0.12126860063129415, "flos": 22307963212800.0, "grad_norm": 5.124671887293178, "language_loss": 0.80184972, "learning_rate": 3.856842227603578e-06, "loss": 0.82358587, "num_input_tokens_seen": 43643960, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.88671875, "step": 2017, "time_per_iteration": 2.395059585571289 }, { "auxiliary_loss_clip": 0.01122697, "auxiliary_loss_mlp": 0.01047158, "balance_loss_clip": 1.01989436, "balance_loss_mlp": 1.03370953, "epoch": 0.12132872388396213, "flos": 13698416315520.0, "grad_norm": 2.248275907000715, "language_loss": 0.68856907, "learning_rate": 3.856701822297409e-06, "loss": 0.71026766, "num_input_tokens_seen": 43662650, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 0.890625, "step": 2018, "time_per_iteration": 2.3690297603607178 }, { "auxiliary_loss_clip": 0.01126354, "auxiliary_loss_mlp": 0.0104908, "balance_loss_clip": 1.02452278, "balance_loss_mlp": 1.03770387, "epoch": 0.12138884713663009, "flos": 26465804737920.0, "grad_norm": 1.794465734915278, "language_loss": 0.72320479, "learning_rate": 3.856561350730329e-06, "loss": 0.74495912, "num_input_tokens_seen": 43684205, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.88671875, "step": 2019, "time_per_iteration": 2.4442644119262695 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.01058645, "balance_loss_clip": 1.03263319, "balance_loss_mlp": 1.03215969, "epoch": 0.12144897038929806, "flos": 26140379155200.0, "grad_norm": 2.9071528755660077, "language_loss": 0.92150027, "learning_rate": 3.856420812907349e-06, "loss": 0.94330251, "num_input_tokens_seen": 43706320, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.89453125, "step": 2020, "time_per_iteration": 2.4346835613250732 }, { "auxiliary_loss_clip": 0.01122444, "auxiliary_loss_mlp": 0.01047896, "balance_loss_clip": 1.02292204, "balance_loss_mlp": 1.03476238, "epoch": 0.12150909364196603, "flos": 24716376731520.0, "grad_norm": 2.000040683910714, "language_loss": 0.7741518, "learning_rate": 3.856280208833486e-06, "loss": 0.79585522, "num_input_tokens_seen": 43724805, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.875, "step": 2021, "time_per_iteration": 2.417357921600342 }, { "auxiliary_loss_clip": 0.01120585, "auxiliary_loss_mlp": 0.01046029, "balance_loss_clip": 1.02217555, "balance_loss_mlp": 1.03421116, "epoch": 0.121569216894634, "flos": 25185958784640.0, "grad_norm": 2.02603328388594, "language_loss": 0.80683607, "learning_rate": 3.856139538513758e-06, "loss": 0.82850218, "num_input_tokens_seen": 43742320, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.86328125, "step": 2022, "time_per_iteration": 2.413191080093384 }, { "auxiliary_loss_clip": 0.01124483, "auxiliary_loss_mlp": 0.01053167, "balance_loss_clip": 1.02814507, "balance_loss_mlp": 1.03584802, "epoch": 0.12162934014730196, "flos": 13443236121600.0, "grad_norm": 1.7818911963633228, "language_loss": 0.85147119, "learning_rate": 3.855998801953183e-06, "loss": 0.87324774, "num_input_tokens_seen": 43760665, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.88671875, "step": 2023, "time_per_iteration": 2.382678985595703 }, { "auxiliary_loss_clip": 0.01120644, "auxiliary_loss_mlp": 0.01050331, "balance_loss_clip": 1.02470064, "balance_loss_mlp": 1.03305507, "epoch": 0.12168946339996994, "flos": 16945199245440.0, "grad_norm": 2.298953223732629, "language_loss": 0.85245049, "learning_rate": 3.855857999156786e-06, "loss": 0.87416029, "num_input_tokens_seen": 43779020, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.875, "step": 2024, "time_per_iteration": 2.3559911251068115 }, { "auxiliary_loss_clip": 0.01119247, "auxiliary_loss_mlp": 0.01047792, "balance_loss_clip": 1.02230477, "balance_loss_mlp": 1.03056741, "epoch": 0.12174958665263791, "flos": 29820399546240.0, "grad_norm": 2.6608086756553595, "language_loss": 0.71909428, "learning_rate": 3.85571713012959e-06, "loss": 0.74076468, "num_input_tokens_seen": 43798850, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.890625, "step": 2025, "time_per_iteration": 2.4525861740112305 }, { "auxiliary_loss_clip": 0.01122017, "auxiliary_loss_mlp": 0.01044699, "balance_loss_clip": 1.02079761, "balance_loss_mlp": 1.03384447, "epoch": 0.12180970990530587, "flos": 24640824816000.0, "grad_norm": 1.9378737122767655, "language_loss": 0.76372939, "learning_rate": 3.855576194876624e-06, "loss": 0.78539658, "num_input_tokens_seen": 43820130, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.8828125, "step": 2026, "time_per_iteration": 2.443085193634033 }, { "auxiliary_loss_clip": 0.01121749, "auxiliary_loss_mlp": 0.01048304, "balance_loss_clip": 1.02411687, "balance_loss_mlp": 1.03315139, "epoch": 0.12186983315797385, "flos": 20520654514560.0, "grad_norm": 2.396102644901252, "language_loss": 0.88871133, "learning_rate": 3.855435193402916e-06, "loss": 0.91041183, "num_input_tokens_seen": 43838485, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.88671875, "step": 2027, "time_per_iteration": 2.375792980194092 }, { "auxiliary_loss_clip": 0.01119273, "auxiliary_loss_mlp": 0.0104507, "balance_loss_clip": 1.02220595, "balance_loss_mlp": 1.03226328, "epoch": 0.12192995641064182, "flos": 27817117977600.0, "grad_norm": 1.5858442001493853, "language_loss": 0.7563501, "learning_rate": 3.8552941257135e-06, "loss": 0.77799356, "num_input_tokens_seen": 43859080, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8671875, "step": 2028, "time_per_iteration": 2.433760404586792 }, { "auxiliary_loss_clip": 0.0111943, "auxiliary_loss_mlp": 0.01051931, "balance_loss_clip": 1.02595568, "balance_loss_mlp": 1.03203964, "epoch": 0.12199007966330978, "flos": 22016054401920.0, "grad_norm": 2.2405902070927253, "language_loss": 0.7657541, "learning_rate": 3.855152991813408e-06, "loss": 0.78746778, "num_input_tokens_seen": 43879030, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.875, "step": 2029, "time_per_iteration": 3.77600359916687 }, { "auxiliary_loss_clip": 0.01118043, "auxiliary_loss_mlp": 0.0104532, "balance_loss_clip": 1.02150249, "balance_loss_mlp": 1.03121793, "epoch": 0.12205020291597775, "flos": 23294084964480.0, "grad_norm": 3.673993069551172, "language_loss": 0.7888177, "learning_rate": 3.855011791707678e-06, "loss": 0.81045127, "num_input_tokens_seen": 43898505, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.8671875, "step": 2030, "time_per_iteration": 2.3976783752441406 }, { "auxiliary_loss_clip": 0.01117256, "auxiliary_loss_mlp": 0.01049557, "balance_loss_clip": 1.0247736, "balance_loss_mlp": 1.03186178, "epoch": 0.12211032616864573, "flos": 26030402772480.0, "grad_norm": 2.072864758974824, "language_loss": 0.73834264, "learning_rate": 3.854870525401349e-06, "loss": 0.76001072, "num_input_tokens_seen": 43917945, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.8515625, "step": 2031, "time_per_iteration": 2.418802261352539 }, { "auxiliary_loss_clip": 0.01118716, "auxiliary_loss_mlp": 0.01046682, "balance_loss_clip": 1.02255392, "balance_loss_mlp": 1.03258789, "epoch": 0.12217044942131369, "flos": 20409944993280.0, "grad_norm": 4.745944251459415, "language_loss": 0.74995601, "learning_rate": 3.8547291928994615e-06, "loss": 0.77161002, "num_input_tokens_seen": 43937385, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.859375, "step": 2032, "time_per_iteration": 5.171973705291748 }, { "auxiliary_loss_clip": 0.01111557, "auxiliary_loss_mlp": 0.01039076, "balance_loss_clip": 1.01628351, "balance_loss_mlp": 1.02953041, "epoch": 0.12223057267398166, "flos": 22856029735680.0, "grad_norm": 1.6312332353564754, "language_loss": 0.89163828, "learning_rate": 3.8545877942070605e-06, "loss": 0.91314459, "num_input_tokens_seen": 43958130, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8203125, "step": 2033, "time_per_iteration": 2.4246609210968018 }, { "auxiliary_loss_clip": 0.01124556, "auxiliary_loss_mlp": 0.0104608, "balance_loss_clip": 1.02266693, "balance_loss_mlp": 1.03622246, "epoch": 0.12229069592664964, "flos": 20046533984640.0, "grad_norm": 1.9397284321498525, "language_loss": 0.65490395, "learning_rate": 3.8544463293291914e-06, "loss": 0.67661023, "num_input_tokens_seen": 43976800, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8828125, "step": 2034, "time_per_iteration": 3.7740495204925537 }, { "auxiliary_loss_clip": 0.011211, "auxiliary_loss_mlp": 0.01050813, "balance_loss_clip": 1.02624369, "balance_loss_mlp": 1.03414273, "epoch": 0.1223508191793176, "flos": 22273119809280.0, "grad_norm": 2.121930359580294, "language_loss": 0.76366186, "learning_rate": 3.8543047982709035e-06, "loss": 0.78538096, "num_input_tokens_seen": 43996620, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.8671875, "step": 2035, "time_per_iteration": 2.409691333770752 }, { "auxiliary_loss_clip": 0.01122696, "auxiliary_loss_mlp": 0.01048104, "balance_loss_clip": 1.02359462, "balance_loss_mlp": 1.03383136, "epoch": 0.12241094243198557, "flos": 21284973198720.0, "grad_norm": 1.832738408115759, "language_loss": 0.71510398, "learning_rate": 3.854163201037247e-06, "loss": 0.73681188, "num_input_tokens_seen": 44016175, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.890625, "step": 2036, "time_per_iteration": 2.399597406387329 }, { "auxiliary_loss_clip": 0.01120529, "auxiliary_loss_mlp": 0.01052217, "balance_loss_clip": 1.02780282, "balance_loss_mlp": 1.03389144, "epoch": 0.12247106568465355, "flos": 17381473994880.0, "grad_norm": 1.7031477508166286, "language_loss": 0.83004296, "learning_rate": 3.854021537633275e-06, "loss": 0.8517704, "num_input_tokens_seen": 44035060, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.8671875, "step": 2037, "time_per_iteration": 2.3796517848968506 }, { "auxiliary_loss_clip": 0.01126316, "auxiliary_loss_mlp": 0.01045922, "balance_loss_clip": 1.0206852, "balance_loss_mlp": 1.03652191, "epoch": 0.12253118893732151, "flos": 27044420567040.0, "grad_norm": 3.2035945862169948, "language_loss": 0.79517901, "learning_rate": 3.853879808064044e-06, "loss": 0.81690133, "num_input_tokens_seen": 44053330, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.8984375, "step": 2038, "time_per_iteration": 2.4297780990600586 }, { "auxiliary_loss_clip": 0.01033306, "auxiliary_loss_mlp": 0.01017511, "balance_loss_clip": 1.01360095, "balance_loss_mlp": 1.00665402, "epoch": 0.12259131218998948, "flos": 53858762208000.0, "grad_norm": 0.8245276463797878, "language_loss": 0.58636552, "learning_rate": 3.8537380123346105e-06, "loss": 0.60687369, "num_input_tokens_seen": 44107575, "router_z_loss_clip": 0.0390625, "router_z_loss_mlp": 0.265625, "step": 2039, "time_per_iteration": 2.911100149154663 }, { "auxiliary_loss_clip": 0.0112325, "auxiliary_loss_mlp": 0.01049507, "balance_loss_clip": 1.02356696, "balance_loss_mlp": 1.03539979, "epoch": 0.12265143544265744, "flos": 17891031421440.0, "grad_norm": 3.018760004112527, "language_loss": 0.80326319, "learning_rate": 3.853596150450037e-06, "loss": 0.82499075, "num_input_tokens_seen": 44126075, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.87890625, "step": 2040, "time_per_iteration": 2.3550355434417725 }, { "auxiliary_loss_clip": 0.01116468, "auxiliary_loss_mlp": 0.01044236, "balance_loss_clip": 1.02149129, "balance_loss_mlp": 1.03211033, "epoch": 0.12271155869532542, "flos": 21798824722560.0, "grad_norm": 1.7984327608047004, "language_loss": 0.82874405, "learning_rate": 3.853454222415384e-06, "loss": 0.8503511, "num_input_tokens_seen": 44145605, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.84375, "step": 2041, "time_per_iteration": 2.387716054916382 }, { "auxiliary_loss_clip": 0.01122322, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.01605964, "balance_loss_mlp": 1.03212059, "epoch": 0.12277168194799339, "flos": 19827733294080.0, "grad_norm": 1.7672051121677157, "language_loss": 0.67215192, "learning_rate": 3.853312228235717e-06, "loss": 0.69380438, "num_input_tokens_seen": 44164770, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.90234375, "step": 2042, "time_per_iteration": 2.378965139389038 }, { "auxiliary_loss_clip": 0.01120555, "auxiliary_loss_mlp": 0.01055578, "balance_loss_clip": 1.03185534, "balance_loss_mlp": 1.0333581, "epoch": 0.12283180520066135, "flos": 23219929503360.0, "grad_norm": 1.8628373231360564, "language_loss": 0.81608152, "learning_rate": 3.853170167916106e-06, "loss": 0.83784282, "num_input_tokens_seen": 44184025, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.87109375, "step": 2043, "time_per_iteration": 2.3813624382019043 }, { "auxiliary_loss_clip": 0.0111882, "auxiliary_loss_mlp": 0.01047401, "balance_loss_clip": 1.02171087, "balance_loss_mlp": 1.03120828, "epoch": 0.12289192845332933, "flos": 18587478689280.0, "grad_norm": 1.907887097537687, "language_loss": 0.80282354, "learning_rate": 3.853028041461617e-06, "loss": 0.82448578, "num_input_tokens_seen": 44202950, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 0.875, "step": 2044, "time_per_iteration": 2.379453420639038 }, { "auxiliary_loss_clip": 0.01118916, "auxiliary_loss_mlp": 0.01046854, "balance_loss_clip": 1.02279735, "balance_loss_mlp": 1.03514624, "epoch": 0.1229520517059973, "flos": 25768519597440.0, "grad_norm": 1.6706588656358827, "language_loss": 0.78307921, "learning_rate": 3.852885848877323e-06, "loss": 0.80473691, "num_input_tokens_seen": 44221115, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8359375, "step": 2045, "time_per_iteration": 2.4202218055725098 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.0105316, "balance_loss_clip": 1.02633786, "balance_loss_mlp": 1.03491473, "epoch": 0.12301217495866526, "flos": 20886090382080.0, "grad_norm": 2.1187738228664235, "language_loss": 0.67233276, "learning_rate": 3.852743590168301e-06, "loss": 0.69410533, "num_input_tokens_seen": 44240575, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 0.890625, "step": 2046, "time_per_iteration": 2.3931007385253906 }, { "auxiliary_loss_clip": 0.01118281, "auxiliary_loss_mlp": 0.01052155, "balance_loss_clip": 1.02762175, "balance_loss_mlp": 1.03496456, "epoch": 0.12307229821133324, "flos": 22377824576640.0, "grad_norm": 2.1682406600206012, "language_loss": 0.72872901, "learning_rate": 3.852601265339625e-06, "loss": 0.75043344, "num_input_tokens_seen": 44257145, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.83203125, "step": 2047, "time_per_iteration": 2.3804879188537598 }, { "auxiliary_loss_clip": 0.01118315, "auxiliary_loss_mlp": 0.01044566, "balance_loss_clip": 1.01955628, "balance_loss_mlp": 1.03371596, "epoch": 0.1231324214640012, "flos": 23366285389440.0, "grad_norm": 1.6043465109701915, "language_loss": 0.76935506, "learning_rate": 3.8524588743963755e-06, "loss": 0.7909838, "num_input_tokens_seen": 44278035, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.84375, "step": 2048, "time_per_iteration": 2.4538443088531494 }, { "auxiliary_loss_clip": 0.01120431, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.02201271, "balance_loss_mlp": 1.03345227, "epoch": 0.12319254471666917, "flos": 23766075901440.0, "grad_norm": 1.8330515449971934, "language_loss": 0.84730721, "learning_rate": 3.852316417343634e-06, "loss": 0.86896044, "num_input_tokens_seen": 44296980, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8671875, "step": 2049, "time_per_iteration": 2.402829885482788 }, { "auxiliary_loss_clip": 0.01118217, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.02539206, "balance_loss_mlp": 1.03130078, "epoch": 0.12325266796933713, "flos": 23549020778880.0, "grad_norm": 2.288036285795224, "language_loss": 0.75656784, "learning_rate": 3.852173894186484e-06, "loss": 0.77825868, "num_input_tokens_seen": 44318005, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.8671875, "step": 2050, "time_per_iteration": 2.4450416564941406 }, { "auxiliary_loss_clip": 0.01119152, "auxiliary_loss_mlp": 0.01044909, "balance_loss_clip": 1.02019715, "balance_loss_mlp": 1.03360677, "epoch": 0.12331279122200511, "flos": 24422896909440.0, "grad_norm": 2.177058190261101, "language_loss": 0.80784744, "learning_rate": 3.852031304930012e-06, "loss": 0.82948804, "num_input_tokens_seen": 44335260, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.8515625, "step": 2051, "time_per_iteration": 2.494203805923462 }, { "auxiliary_loss_clip": 0.01118979, "auxiliary_loss_mlp": 0.01048398, "balance_loss_clip": 1.02320886, "balance_loss_mlp": 1.0356648, "epoch": 0.12337291447467308, "flos": 25483104299520.0, "grad_norm": 1.7588340293504667, "language_loss": 0.80011177, "learning_rate": 3.851888649579307e-06, "loss": 0.82178557, "num_input_tokens_seen": 44355315, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.83203125, "step": 2052, "time_per_iteration": 2.4480302333831787 }, { "auxiliary_loss_clip": 0.01119589, "auxiliary_loss_mlp": 0.01049361, "balance_loss_clip": 1.02334929, "balance_loss_mlp": 1.03246593, "epoch": 0.12343303772734104, "flos": 23548881133440.0, "grad_norm": 2.062989969890706, "language_loss": 0.7362048, "learning_rate": 3.85174592813946e-06, "loss": 0.75789428, "num_input_tokens_seen": 44373020, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.87109375, "step": 2053, "time_per_iteration": 2.395052909851074 }, { "auxiliary_loss_clip": 0.01117386, "auxiliary_loss_mlp": 0.01044747, "balance_loss_clip": 1.02097631, "balance_loss_mlp": 1.02949238, "epoch": 0.12349316098000902, "flos": 47555299900800.0, "grad_norm": 1.7078781870035997, "language_loss": 0.74674809, "learning_rate": 3.851603140615564e-06, "loss": 0.76836938, "num_input_tokens_seen": 44397525, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.87890625, "step": 2054, "time_per_iteration": 2.6298487186431885 }, { "auxiliary_loss_clip": 0.0111469, "auxiliary_loss_mlp": 0.01037518, "balance_loss_clip": 1.01553583, "balance_loss_mlp": 1.02974188, "epoch": 0.12355328423267699, "flos": 25044804691200.0, "grad_norm": 2.2983846207671887, "language_loss": 0.84969324, "learning_rate": 3.851460287012714e-06, "loss": 0.87121534, "num_input_tokens_seen": 44415890, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.84765625, "step": 2055, "time_per_iteration": 2.4238319396972656 }, { "auxiliary_loss_clip": 0.01117869, "auxiliary_loss_mlp": 0.01047669, "balance_loss_clip": 1.02550793, "balance_loss_mlp": 1.03191376, "epoch": 0.12361340748534495, "flos": 27707909644800.0, "grad_norm": 2.378386507328866, "language_loss": 0.77205324, "learning_rate": 3.85131736733601e-06, "loss": 0.79370862, "num_input_tokens_seen": 44436625, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.859375, "step": 2056, "time_per_iteration": 2.454680919647217 }, { "auxiliary_loss_clip": 0.0111708, "auxiliary_loss_mlp": 0.01043758, "balance_loss_clip": 1.0191288, "balance_loss_mlp": 1.03185117, "epoch": 0.12367353073801293, "flos": 26139401637120.0, "grad_norm": 2.3364790556535238, "language_loss": 0.83136255, "learning_rate": 3.851174381590551e-06, "loss": 0.85297096, "num_input_tokens_seen": 44455265, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.8515625, "step": 2057, "time_per_iteration": 2.4133453369140625 }, { "auxiliary_loss_clip": 0.01122137, "auxiliary_loss_mlp": 0.0104733, "balance_loss_clip": 1.02342844, "balance_loss_mlp": 1.03349555, "epoch": 0.1237336539906809, "flos": 25154850896640.0, "grad_norm": 1.7560536613187636, "language_loss": 0.78054428, "learning_rate": 3.85103132978144e-06, "loss": 0.802239, "num_input_tokens_seen": 44475815, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.88671875, "step": 2058, "time_per_iteration": 2.4437592029571533 }, { "auxiliary_loss_clip": 0.01117888, "auxiliary_loss_mlp": 0.0104837, "balance_loss_clip": 1.02399194, "balance_loss_mlp": 1.03020191, "epoch": 0.12379377724334886, "flos": 15303687851520.0, "grad_norm": 2.113776375453416, "language_loss": 0.83108556, "learning_rate": 3.850888211913782e-06, "loss": 0.85274816, "num_input_tokens_seen": 44494045, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.87890625, "step": 2059, "time_per_iteration": 2.3637967109680176 }, { "auxiliary_loss_clip": 0.01122157, "auxiliary_loss_mlp": 0.0105333, "balance_loss_clip": 1.02692533, "balance_loss_mlp": 1.03408313, "epoch": 0.12385390049601683, "flos": 21315871618560.0, "grad_norm": 2.294771563660056, "language_loss": 0.8141284, "learning_rate": 3.8507450279926856e-06, "loss": 0.83588326, "num_input_tokens_seen": 44509120, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.8828125, "step": 2060, "time_per_iteration": 2.4067494869232178 }, { "auxiliary_loss_clip": 0.01115806, "auxiliary_loss_mlp": 0.01048251, "balance_loss_clip": 1.02343154, "balance_loss_mlp": 1.03006387, "epoch": 0.1239140237486848, "flos": 15115576112640.0, "grad_norm": 2.2533304529817393, "language_loss": 0.85985982, "learning_rate": 3.850601778023259e-06, "loss": 0.88150042, "num_input_tokens_seen": 44525780, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.859375, "step": 2061, "time_per_iteration": 2.3487155437469482 }, { "auxiliary_loss_clip": 0.01117927, "auxiliary_loss_mlp": 0.01044717, "balance_loss_clip": 1.02045834, "balance_loss_mlp": 1.03339148, "epoch": 0.12397414700135277, "flos": 21975834648960.0, "grad_norm": 1.807106900031189, "language_loss": 0.84427786, "learning_rate": 3.850458462010615e-06, "loss": 0.86590421, "num_input_tokens_seen": 44543125, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.84375, "step": 2062, "time_per_iteration": 2.4029412269592285 }, { "auxiliary_loss_clip": 0.0111791, "auxiliary_loss_mlp": 0.01050326, "balance_loss_clip": 1.02610242, "balance_loss_mlp": 1.03355742, "epoch": 0.12403427025402074, "flos": 13400223459840.0, "grad_norm": 1.7333092179216898, "language_loss": 0.78806698, "learning_rate": 3.850315079959869e-06, "loss": 0.80974936, "num_input_tokens_seen": 44560275, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.84375, "step": 2063, "time_per_iteration": 2.377467393875122 }, { "auxiliary_loss_clip": 0.01117095, "auxiliary_loss_mlp": 0.01046062, "balance_loss_clip": 1.02002692, "balance_loss_mlp": 1.03245807, "epoch": 0.12409439350668872, "flos": 15303478383360.0, "grad_norm": 2.240657909027672, "language_loss": 0.79231298, "learning_rate": 3.850171631876137e-06, "loss": 0.81394458, "num_input_tokens_seen": 44577640, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 0.84375, "step": 2064, "time_per_iteration": 2.3639748096466064 }, { "auxiliary_loss_clip": 0.01116078, "auxiliary_loss_mlp": 0.01052179, "balance_loss_clip": 1.02820563, "balance_loss_mlp": 1.03158522, "epoch": 0.12415451675935668, "flos": 25008215719680.0, "grad_norm": 3.4474827998857696, "language_loss": 0.92303932, "learning_rate": 3.850028117764539e-06, "loss": 0.94472188, "num_input_tokens_seen": 44594860, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.84375, "step": 2065, "time_per_iteration": 2.410625696182251 }, { "auxiliary_loss_clip": 0.01117973, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.01925051, "balance_loss_mlp": 1.03079259, "epoch": 0.12421464001202465, "flos": 23658543313920.0, "grad_norm": 1.8623053813568275, "language_loss": 0.80406475, "learning_rate": 3.849884537630196e-06, "loss": 0.82568353, "num_input_tokens_seen": 44614780, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.875, "step": 2066, "time_per_iteration": 2.4238030910491943 }, { "auxiliary_loss_clip": 0.01035428, "auxiliary_loss_mlp": 0.01008362, "balance_loss_clip": 1.00404668, "balance_loss_mlp": 1.00907445, "epoch": 0.12427476326469263, "flos": 65729440604160.0, "grad_norm": 0.8776658503758344, "language_loss": 0.63336056, "learning_rate": 3.849740891478233e-06, "loss": 0.65379852, "num_input_tokens_seen": 44671240, "router_z_loss_clip": 0.04321289, "router_z_loss_mlp": 0.26367188, "step": 2067, "time_per_iteration": 2.973073720932007 }, { "auxiliary_loss_clip": 0.01116555, "auxiliary_loss_mlp": 0.01042225, "balance_loss_clip": 1.01896691, "balance_loss_mlp": 1.03140807, "epoch": 0.12433488651736059, "flos": 24534269746560.0, "grad_norm": 2.794207783635333, "language_loss": 0.9301703, "learning_rate": 3.849597179313775e-06, "loss": 0.95175815, "num_input_tokens_seen": 44691050, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8515625, "step": 2068, "time_per_iteration": 2.4296932220458984 }, { "auxiliary_loss_clip": 0.01119675, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.02181268, "balance_loss_mlp": 1.03442335, "epoch": 0.12439500977002856, "flos": 21030630877440.0, "grad_norm": 1.8576322137631927, "language_loss": 0.81259358, "learning_rate": 3.849453401141952e-06, "loss": 0.83422422, "num_input_tokens_seen": 44709850, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.8515625, "step": 2069, "time_per_iteration": 3.7766854763031006 }, { "auxiliary_loss_clip": 0.0111926, "auxiliary_loss_mlp": 0.0105096, "balance_loss_clip": 1.02696347, "balance_loss_mlp": 1.03305292, "epoch": 0.12445513302269653, "flos": 26829495037440.0, "grad_norm": 1.798662539204355, "language_loss": 0.77407026, "learning_rate": 3.8493095569678945e-06, "loss": 0.79577243, "num_input_tokens_seen": 44731475, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.86328125, "step": 2070, "time_per_iteration": 2.4453954696655273 }, { "auxiliary_loss_clip": 0.01119335, "auxiliary_loss_mlp": 0.01041812, "balance_loss_clip": 1.01684988, "balance_loss_mlp": 1.03424931, "epoch": 0.1245152562753645, "flos": 18367944860160.0, "grad_norm": 2.31678494920857, "language_loss": 0.8035953, "learning_rate": 3.849165646796735e-06, "loss": 0.82520676, "num_input_tokens_seen": 44749685, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.8515625, "step": 2071, "time_per_iteration": 2.3660426139831543 }, { "auxiliary_loss_clip": 0.01118492, "auxiliary_loss_mlp": 0.01049068, "balance_loss_clip": 1.02392673, "balance_loss_mlp": 1.03493595, "epoch": 0.12457537952803246, "flos": 33106634179200.0, "grad_norm": 1.6516694868819906, "language_loss": 0.7830193, "learning_rate": 3.849021670633611e-06, "loss": 0.80469489, "num_input_tokens_seen": 44772165, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.8359375, "step": 2072, "time_per_iteration": 5.2420947551727295 }, { "auxiliary_loss_clip": 0.01117737, "auxiliary_loss_mlp": 0.01051804, "balance_loss_clip": 1.02922606, "balance_loss_mlp": 1.03530848, "epoch": 0.12463550278070043, "flos": 22269209736960.0, "grad_norm": 2.4421081395701836, "language_loss": 0.74980325, "learning_rate": 3.8488776284836595e-06, "loss": 0.77149862, "num_input_tokens_seen": 44790580, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.8203125, "step": 2073, "time_per_iteration": 3.8428778648376465 }, { "auxiliary_loss_clip": 0.01116143, "auxiliary_loss_mlp": 0.01049777, "balance_loss_clip": 1.02605462, "balance_loss_mlp": 1.03189266, "epoch": 0.12469562603336841, "flos": 14678288933760.0, "grad_norm": 2.1570640646911725, "language_loss": 0.90657204, "learning_rate": 3.8487335203520215e-06, "loss": 0.92823124, "num_input_tokens_seen": 44806730, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.84375, "step": 2074, "time_per_iteration": 2.376634359359741 }, { "auxiliary_loss_clip": 0.01118504, "auxiliary_loss_mlp": 0.01048479, "balance_loss_clip": 1.0228374, "balance_loss_mlp": 1.03194141, "epoch": 0.12475574928603637, "flos": 24643617724800.0, "grad_norm": 2.37343051951324, "language_loss": 0.83716631, "learning_rate": 3.84858934624384e-06, "loss": 0.85883617, "num_input_tokens_seen": 44825550, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.86328125, "step": 2075, "time_per_iteration": 2.3994948863983154 }, { "auxiliary_loss_clip": 0.01117635, "auxiliary_loss_mlp": 0.01050142, "balance_loss_clip": 1.02473879, "balance_loss_mlp": 1.03250575, "epoch": 0.12481587253870434, "flos": 21761886637440.0, "grad_norm": 2.710930549668586, "language_loss": 0.73307014, "learning_rate": 3.8484451061642585e-06, "loss": 0.75474799, "num_input_tokens_seen": 44844155, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.8515625, "step": 2076, "time_per_iteration": 2.40032696723938 }, { "auxiliary_loss_clip": 0.01115359, "auxiliary_loss_mlp": 0.0104041, "balance_loss_clip": 1.01834464, "balance_loss_mlp": 1.03534937, "epoch": 0.12487599579137232, "flos": 21431503641600.0, "grad_norm": 1.7213159578200155, "language_loss": 0.75646508, "learning_rate": 3.8483008001184275e-06, "loss": 0.77802277, "num_input_tokens_seen": 44863780, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.80078125, "step": 2077, "time_per_iteration": 2.4050636291503906 }, { "auxiliary_loss_clip": 0.01114992, "auxiliary_loss_mlp": 0.0104251, "balance_loss_clip": 1.01846504, "balance_loss_mlp": 1.03148556, "epoch": 0.12493611904404028, "flos": 16106690188800.0, "grad_norm": 2.7697005372158348, "language_loss": 0.81957054, "learning_rate": 3.848156428111495e-06, "loss": 0.84114563, "num_input_tokens_seen": 44881480, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8359375, "step": 2078, "time_per_iteration": 2.392759323120117 }, { "auxiliary_loss_clip": 0.01119904, "auxiliary_loss_mlp": 0.01045197, "balance_loss_clip": 1.02247524, "balance_loss_mlp": 1.0352838, "epoch": 0.12499624229670825, "flos": 21579186159360.0, "grad_norm": 1.691066522987815, "language_loss": 0.75001132, "learning_rate": 3.8480119901486135e-06, "loss": 0.77166235, "num_input_tokens_seen": 44900390, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.84765625, "step": 2079, "time_per_iteration": 2.3888022899627686 }, { "auxiliary_loss_clip": 0.01123983, "auxiliary_loss_mlp": 0.01051593, "balance_loss_clip": 1.02683294, "balance_loss_mlp": 1.03642082, "epoch": 0.1250563655493762, "flos": 25697960006400.0, "grad_norm": 2.096393689326478, "language_loss": 0.8320049, "learning_rate": 3.847867486234937e-06, "loss": 0.8537606, "num_input_tokens_seen": 44920375, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.875, "step": 2080, "time_per_iteration": 2.4352903366088867 }, { "auxiliary_loss_clip": 0.01116862, "auxiliary_loss_mlp": 0.01050311, "balance_loss_clip": 1.02584922, "balance_loss_mlp": 1.03381598, "epoch": 0.12511648880204418, "flos": 16908575362560.0, "grad_norm": 2.0247473194895234, "language_loss": 0.84366202, "learning_rate": 3.847722916375624e-06, "loss": 0.8653338, "num_input_tokens_seen": 44938415, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.828125, "step": 2081, "time_per_iteration": 2.3475122451782227 }, { "auxiliary_loss_clip": 0.01116416, "auxiliary_loss_mlp": 0.01041025, "balance_loss_clip": 1.01849461, "balance_loss_mlp": 1.03229547, "epoch": 0.12517661205471217, "flos": 17566513534080.0, "grad_norm": 1.7118357765139873, "language_loss": 0.76701432, "learning_rate": 3.847578280575832e-06, "loss": 0.78858876, "num_input_tokens_seen": 44957135, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.83984375, "step": 2082, "time_per_iteration": 2.3789234161376953 }, { "auxiliary_loss_clip": 0.01127754, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.01657128, "balance_loss_mlp": 1.03636873, "epoch": 0.12523673530738014, "flos": 16032883841280.0, "grad_norm": 2.459898948127579, "language_loss": 0.79046977, "learning_rate": 3.847433578840725e-06, "loss": 0.81217635, "num_input_tokens_seen": 44974480, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.9140625, "step": 2083, "time_per_iteration": 2.355137586593628 }, { "auxiliary_loss_clip": 0.01117899, "auxiliary_loss_mlp": 0.01042609, "balance_loss_clip": 1.0177182, "balance_loss_mlp": 1.0319339, "epoch": 0.1252968585600481, "flos": 18806733227520.0, "grad_norm": 3.531654342694931, "language_loss": 0.90191615, "learning_rate": 3.847288811175465e-06, "loss": 0.92352128, "num_input_tokens_seen": 44990310, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.859375, "step": 2084, "time_per_iteration": 2.3714373111724854 }, { "auxiliary_loss_clip": 0.01118012, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.01832533, "balance_loss_mlp": 1.03389311, "epoch": 0.12535698181271607, "flos": 27270343175040.0, "grad_norm": 1.9371363425061896, "language_loss": 0.7973994, "learning_rate": 3.84714397758522e-06, "loss": 0.81899923, "num_input_tokens_seen": 45010720, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.84375, "step": 2085, "time_per_iteration": 2.435650587081909 }, { "auxiliary_loss_clip": 0.01115806, "auxiliary_loss_mlp": 0.0104591, "balance_loss_clip": 1.0222826, "balance_loss_mlp": 1.0319972, "epoch": 0.12541710506538403, "flos": 22053027398400.0, "grad_norm": 1.941421465247455, "language_loss": 0.88167977, "learning_rate": 3.846999078075156e-06, "loss": 0.90329695, "num_input_tokens_seen": 45030360, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.8359375, "step": 2086, "time_per_iteration": 2.3734970092773438 }, { "auxiliary_loss_clip": 0.01112312, "auxiliary_loss_mlp": 0.01044926, "balance_loss_clip": 1.02293229, "balance_loss_mlp": 1.03155839, "epoch": 0.125477228318052, "flos": 12602388003840.0, "grad_norm": 2.0588020364240602, "language_loss": 0.87000966, "learning_rate": 3.8468541126504476e-06, "loss": 0.89158201, "num_input_tokens_seen": 45045085, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.80859375, "step": 2087, "time_per_iteration": 2.3449716567993164 }, { "auxiliary_loss_clip": 0.01116882, "auxiliary_loss_mlp": 0.01043699, "balance_loss_clip": 1.0192132, "balance_loss_mlp": 1.03264987, "epoch": 0.12553735157071996, "flos": 23877413827200.0, "grad_norm": 1.8887299433909066, "language_loss": 0.73232102, "learning_rate": 3.846709081316266e-06, "loss": 0.75392687, "num_input_tokens_seen": 45065145, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.84375, "step": 2088, "time_per_iteration": 2.399785280227661 }, { "auxiliary_loss_clip": 0.01038205, "auxiliary_loss_mlp": 0.01002933, "balance_loss_clip": 0.9987132, "balance_loss_mlp": 1.01034844, "epoch": 0.12559747482338796, "flos": 69917482321920.0, "grad_norm": 0.7564133700942366, "language_loss": 0.61721826, "learning_rate": 3.846563984077788e-06, "loss": 0.63762963, "num_input_tokens_seen": 45126230, "router_z_loss_clip": 0.04223633, "router_z_loss_mlp": 0.27734375, "step": 2089, "time_per_iteration": 3.0131125450134277 }, { "auxiliary_loss_clip": 0.01112258, "auxiliary_loss_mlp": 0.01043145, "balance_loss_clip": 1.02019763, "balance_loss_mlp": 1.03125, "epoch": 0.12565759807605592, "flos": 24278426236800.0, "grad_norm": 3.37125939911503, "language_loss": 0.77657014, "learning_rate": 3.846418820940191e-06, "loss": 0.79812419, "num_input_tokens_seen": 45145545, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.80859375, "step": 2090, "time_per_iteration": 2.400271415710449 }, { "auxiliary_loss_clip": 0.01036273, "auxiliary_loss_mlp": 0.01004513, "balance_loss_clip": 1.00050783, "balance_loss_mlp": 1.00848293, "epoch": 0.12571772132872389, "flos": 56448375016320.0, "grad_norm": 0.7524356481684067, "language_loss": 0.59461302, "learning_rate": 3.846273591908656e-06, "loss": 0.61502087, "num_input_tokens_seen": 45206845, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.27734375, "step": 2091, "time_per_iteration": 2.971236228942871 }, { "auxiliary_loss_clip": 0.0111571, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.01947236, "balance_loss_mlp": 1.03398085, "epoch": 0.12577784458139185, "flos": 41244225050880.0, "grad_norm": 2.0347088973012024, "language_loss": 0.6315937, "learning_rate": 3.846128296988365e-06, "loss": 0.65316451, "num_input_tokens_seen": 45228495, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.81640625, "step": 2092, "time_per_iteration": 2.5666489601135254 }, { "auxiliary_loss_clip": 0.01119389, "auxiliary_loss_mlp": 0.0105116, "balance_loss_clip": 1.02620912, "balance_loss_mlp": 1.03349912, "epoch": 0.12583796783405982, "flos": 19754485528320.0, "grad_norm": 4.127986859516993, "language_loss": 0.80670291, "learning_rate": 3.845982936184505e-06, "loss": 0.82840842, "num_input_tokens_seen": 45245720, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.859375, "step": 2093, "time_per_iteration": 2.372065305709839 }, { "auxiliary_loss_clip": 0.01117134, "auxiliary_loss_mlp": 0.01052396, "balance_loss_clip": 1.02943611, "balance_loss_mlp": 1.03341269, "epoch": 0.12589809108672778, "flos": 22600989187200.0, "grad_norm": 1.8013595234153057, "language_loss": 0.75998724, "learning_rate": 3.845837509502262e-06, "loss": 0.78168249, "num_input_tokens_seen": 45265650, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.8359375, "step": 2094, "time_per_iteration": 2.4176156520843506 }, { "auxiliary_loss_clip": 0.01112457, "auxiliary_loss_mlp": 0.01048578, "balance_loss_clip": 1.02617884, "balance_loss_mlp": 1.03079164, "epoch": 0.12595821433939577, "flos": 45221111665920.0, "grad_norm": 1.9134547374868065, "language_loss": 0.76899022, "learning_rate": 3.845692016946826e-06, "loss": 0.79060054, "num_input_tokens_seen": 45287790, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.81640625, "step": 2095, "time_per_iteration": 2.5881059169769287 }, { "auxiliary_loss_clip": 0.01116577, "auxiliary_loss_mlp": 0.01042894, "balance_loss_clip": 1.01988709, "balance_loss_mlp": 1.03235626, "epoch": 0.12601833759206374, "flos": 14318927642880.0, "grad_norm": 2.3136884654316052, "language_loss": 0.82832527, "learning_rate": 3.845546458523391e-06, "loss": 0.84991997, "num_input_tokens_seen": 45305720, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.84375, "step": 2096, "time_per_iteration": 2.38016676902771 }, { "auxiliary_loss_clip": 0.01114748, "auxiliary_loss_mlp": 0.01045874, "balance_loss_clip": 1.02244925, "balance_loss_mlp": 1.03167081, "epoch": 0.1260784608447317, "flos": 21287172614400.0, "grad_norm": 2.0683992218288885, "language_loss": 0.7564081, "learning_rate": 3.845400834237148e-06, "loss": 0.7780143, "num_input_tokens_seen": 45325290, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.83203125, "step": 2097, "time_per_iteration": 2.400761127471924 }, { "auxiliary_loss_clip": 0.01115307, "auxiliary_loss_mlp": 0.01049884, "balance_loss_clip": 1.0281167, "balance_loss_mlp": 1.0332402, "epoch": 0.12613858409739967, "flos": 26250076247040.0, "grad_norm": 3.3089693939595457, "language_loss": 0.8743059, "learning_rate": 3.8452551440932975e-06, "loss": 0.89595783, "num_input_tokens_seen": 45344465, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.8203125, "step": 2098, "time_per_iteration": 2.4255878925323486 }, { "auxiliary_loss_clip": 0.01119626, "auxiliary_loss_mlp": 0.01057123, "balance_loss_clip": 1.03171968, "balance_loss_mlp": 1.03246355, "epoch": 0.12619870735006763, "flos": 21578906868480.0, "grad_norm": 1.9861322343369732, "language_loss": 0.69507301, "learning_rate": 3.8451093880970365e-06, "loss": 0.71684051, "num_input_tokens_seen": 45362465, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.87109375, "step": 2099, "time_per_iteration": 2.3840246200561523 }, { "auxiliary_loss_clip": 0.01116001, "auxiliary_loss_mlp": 0.0105122, "balance_loss_clip": 1.02680588, "balance_loss_mlp": 1.03224981, "epoch": 0.1262588306027356, "flos": 23365936275840.0, "grad_norm": 2.4534387303493603, "language_loss": 0.81588322, "learning_rate": 3.844963566253569e-06, "loss": 0.83755541, "num_input_tokens_seen": 45382700, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.83984375, "step": 2100, "time_per_iteration": 2.4223971366882324 }, { "auxiliary_loss_clip": 0.01119178, "auxiliary_loss_mlp": 0.01048629, "balance_loss_clip": 1.0248704, "balance_loss_mlp": 1.03249133, "epoch": 0.12631895385540357, "flos": 23948113063680.0, "grad_norm": 1.9546686395482318, "language_loss": 0.80489665, "learning_rate": 3.844817678568097e-06, "loss": 0.82657468, "num_input_tokens_seen": 45401005, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.8671875, "step": 2101, "time_per_iteration": 2.4068095684051514 }, { "auxiliary_loss_clip": 0.01033474, "auxiliary_loss_mlp": 0.01005857, "balance_loss_clip": 1.0021373, "balance_loss_mlp": 1.00711823, "epoch": 0.12637907710807156, "flos": 70278868471680.0, "grad_norm": 0.7011092592635109, "language_loss": 0.57050014, "learning_rate": 3.8446717250458275e-06, "loss": 0.59089339, "num_input_tokens_seen": 45466555, "router_z_loss_clip": 0.03710938, "router_z_loss_mlp": 0.26367188, "step": 2102, "time_per_iteration": 3.098021984100342 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01044396, "balance_loss_clip": 1.02066135, "balance_loss_mlp": 1.03210449, "epoch": 0.12643920036073952, "flos": 18914126169600.0, "grad_norm": 2.1890385512611754, "language_loss": 0.93189907, "learning_rate": 3.844525705691969e-06, "loss": 0.9535197, "num_input_tokens_seen": 45485165, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.85546875, "step": 2103, "time_per_iteration": 2.3683717250823975 }, { "auxiliary_loss_clip": 0.01109523, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.01373827, "balance_loss_mlp": 1.03146935, "epoch": 0.1264993236134075, "flos": 27781227233280.0, "grad_norm": 2.0237792358659945, "language_loss": 0.77780366, "learning_rate": 3.844379620511733e-06, "loss": 0.7992444, "num_input_tokens_seen": 45504630, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.78125, "step": 2104, "time_per_iteration": 2.473353385925293 }, { "auxiliary_loss_clip": 0.011191, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.02412975, "balance_loss_mlp": 1.03558755, "epoch": 0.12655944686607545, "flos": 24753524284800.0, "grad_norm": 3.309421676063958, "language_loss": 0.81139278, "learning_rate": 3.844233469510333e-06, "loss": 0.83304989, "num_input_tokens_seen": 45524885, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.8359375, "step": 2105, "time_per_iteration": 2.4122915267944336 }, { "auxiliary_loss_clip": 0.01121015, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.0182066, "balance_loss_mlp": 1.03295863, "epoch": 0.12661957011874342, "flos": 24131930705280.0, "grad_norm": 2.622898736258822, "language_loss": 0.83117187, "learning_rate": 3.844087252692984e-06, "loss": 0.85281229, "num_input_tokens_seen": 45545000, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.8828125, "step": 2106, "time_per_iteration": 2.431649684906006 }, { "auxiliary_loss_clip": 0.01115561, "auxiliary_loss_mlp": 0.01042881, "balance_loss_clip": 1.01909912, "balance_loss_mlp": 1.03455234, "epoch": 0.12667969337141138, "flos": 24568519656960.0, "grad_norm": 1.8570276402480308, "language_loss": 0.7331838, "learning_rate": 3.843940970064904e-06, "loss": 0.75476825, "num_input_tokens_seen": 45564210, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.80859375, "step": 2107, "time_per_iteration": 2.404420852661133 }, { "auxiliary_loss_clip": 0.0111377, "auxiliary_loss_mlp": 0.01038722, "balance_loss_clip": 1.01615608, "balance_loss_mlp": 1.03279042, "epoch": 0.12673981662407935, "flos": 22960699591680.0, "grad_norm": 1.886671777421692, "language_loss": 0.78911781, "learning_rate": 3.843794621631314e-06, "loss": 0.81064278, "num_input_tokens_seen": 45583030, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.8125, "step": 2108, "time_per_iteration": 2.4051084518432617 }, { "auxiliary_loss_clip": 0.01114943, "auxiliary_loss_mlp": 0.01042829, "balance_loss_clip": 1.02022719, "balance_loss_mlp": 1.03060329, "epoch": 0.12679993987674734, "flos": 17273906496000.0, "grad_norm": 1.9965757902599248, "language_loss": 0.75592458, "learning_rate": 3.843648207397438e-06, "loss": 0.7775023, "num_input_tokens_seen": 45602265, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.84375, "step": 2109, "time_per_iteration": 3.766605854034424 }, { "auxiliary_loss_clip": 0.0111483, "auxiliary_loss_mlp": 0.01044969, "balance_loss_clip": 1.02223563, "balance_loss_mlp": 1.0322988, "epoch": 0.1268600631294153, "flos": 17274115964160.0, "grad_norm": 1.7273210348148718, "language_loss": 0.8307693, "learning_rate": 3.843501727368498e-06, "loss": 0.85236728, "num_input_tokens_seen": 45620595, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.82421875, "step": 2110, "time_per_iteration": 2.4035935401916504 }, { "auxiliary_loss_clip": 0.01112926, "auxiliary_loss_mlp": 0.01041728, "balance_loss_clip": 1.01898241, "balance_loss_mlp": 1.03165388, "epoch": 0.12692018638208327, "flos": 24059904837120.0, "grad_norm": 1.7158466888913007, "language_loss": 0.78610981, "learning_rate": 3.8433551815497255e-06, "loss": 0.80765629, "num_input_tokens_seen": 45641140, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.81640625, "step": 2111, "time_per_iteration": 5.266538381576538 }, { "auxiliary_loss_clip": 0.01123097, "auxiliary_loss_mlp": 0.0105393, "balance_loss_clip": 1.027632, "balance_loss_mlp": 1.03377807, "epoch": 0.12698030963475124, "flos": 31830558652800.0, "grad_norm": 2.513129431042387, "language_loss": 0.76426053, "learning_rate": 3.843208569946347e-06, "loss": 0.78603077, "num_input_tokens_seen": 45662315, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 0.89453125, "step": 2112, "time_per_iteration": 2.4973158836364746 }, { "auxiliary_loss_clip": 0.01114453, "auxiliary_loss_mlp": 0.0105147, "balance_loss_clip": 1.0288676, "balance_loss_mlp": 1.03119397, "epoch": 0.1270404328874192, "flos": 25186691923200.0, "grad_norm": 1.7756504212212387, "language_loss": 0.85513252, "learning_rate": 3.843061892563596e-06, "loss": 0.87679178, "num_input_tokens_seen": 45680335, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.83203125, "step": 2113, "time_per_iteration": 3.8007261753082275 }, { "auxiliary_loss_clip": 0.0111456, "auxiliary_loss_mlp": 0.01046979, "balance_loss_clip": 1.02329206, "balance_loss_mlp": 1.03161609, "epoch": 0.12710055614008717, "flos": 15996434515200.0, "grad_norm": 2.1560867935703585, "language_loss": 0.73853689, "learning_rate": 3.842915149406707e-06, "loss": 0.76015228, "num_input_tokens_seen": 45696240, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.828125, "step": 2114, "time_per_iteration": 2.369175672531128 }, { "auxiliary_loss_clip": 0.01118727, "auxiliary_loss_mlp": 0.0104384, "balance_loss_clip": 1.02104712, "balance_loss_mlp": 1.03434312, "epoch": 0.12716067939275516, "flos": 15084747515520.0, "grad_norm": 1.9327626940509444, "language_loss": 0.83024955, "learning_rate": 3.842768340480917e-06, "loss": 0.85187531, "num_input_tokens_seen": 45713695, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.84375, "step": 2115, "time_per_iteration": 2.3644826412200928 }, { "auxiliary_loss_clip": 0.0111886, "auxiliary_loss_mlp": 0.01048808, "balance_loss_clip": 1.02586019, "balance_loss_mlp": 1.03423762, "epoch": 0.12722080264542313, "flos": 28365463791360.0, "grad_norm": 1.6253823017253595, "language_loss": 0.86538076, "learning_rate": 3.8426214657914656e-06, "loss": 0.88705742, "num_input_tokens_seen": 45736655, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.84375, "step": 2116, "time_per_iteration": 2.4714713096618652 }, { "auxiliary_loss_clip": 0.01114616, "auxiliary_loss_mlp": 0.01041541, "balance_loss_clip": 1.01851034, "balance_loss_mlp": 1.03221858, "epoch": 0.1272809258980911, "flos": 32378520441600.0, "grad_norm": 1.7531392172531437, "language_loss": 0.70339799, "learning_rate": 3.842474525343594e-06, "loss": 0.72495955, "num_input_tokens_seen": 45758195, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.82421875, "step": 2117, "time_per_iteration": 2.485377788543701 }, { "auxiliary_loss_clip": 0.01114204, "auxiliary_loss_mlp": 0.01045505, "balance_loss_clip": 1.02188969, "balance_loss_mlp": 1.03143668, "epoch": 0.12734104915075906, "flos": 16033477334400.0, "grad_norm": 1.9788775424089131, "language_loss": 0.86027038, "learning_rate": 3.842327519142545e-06, "loss": 0.88186753, "num_input_tokens_seen": 45774280, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.828125, "step": 2118, "time_per_iteration": 2.361762523651123 }, { "auxiliary_loss_clip": 0.01112817, "auxiliary_loss_mlp": 0.01045058, "balance_loss_clip": 1.02202654, "balance_loss_mlp": 1.03070664, "epoch": 0.12740117240342702, "flos": 18259330020480.0, "grad_norm": 2.068224738756466, "language_loss": 0.87385684, "learning_rate": 3.842180447193566e-06, "loss": 0.89543557, "num_input_tokens_seen": 45792760, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8203125, "step": 2119, "time_per_iteration": 2.36799693107605 }, { "auxiliary_loss_clip": 0.01116089, "auxiliary_loss_mlp": 0.01042747, "balance_loss_clip": 1.01886976, "balance_loss_mlp": 1.031811, "epoch": 0.127461295656095, "flos": 12121215379200.0, "grad_norm": 4.792706132267744, "language_loss": 0.87717366, "learning_rate": 3.842033309501905e-06, "loss": 0.89876205, "num_input_tokens_seen": 45804300, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.84375, "step": 2120, "time_per_iteration": 2.314236640930176 }, { "auxiliary_loss_clip": 0.01111922, "auxiliary_loss_mlp": 0.01036893, "balance_loss_clip": 1.01433849, "balance_loss_mlp": 1.03037643, "epoch": 0.12752141890876295, "flos": 23147973457920.0, "grad_norm": 1.9133798996998994, "language_loss": 0.75380892, "learning_rate": 3.841886106072815e-06, "loss": 0.77529705, "num_input_tokens_seen": 45823780, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.8125, "step": 2121, "time_per_iteration": 2.3969669342041016 }, { "auxiliary_loss_clip": 0.01111711, "auxiliary_loss_mlp": 0.01044018, "balance_loss_clip": 1.02122521, "balance_loss_mlp": 1.03040743, "epoch": 0.12758154216143094, "flos": 21614937258240.0, "grad_norm": 2.479920683180096, "language_loss": 0.83177739, "learning_rate": 3.841738836911547e-06, "loss": 0.85333467, "num_input_tokens_seen": 45840495, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.8125, "step": 2122, "time_per_iteration": 2.3829691410064697 }, { "auxiliary_loss_clip": 0.01115723, "auxiliary_loss_mlp": 0.01043598, "balance_loss_clip": 1.02047181, "balance_loss_mlp": 1.03254235, "epoch": 0.1276416654140989, "flos": 15923954799360.0, "grad_norm": 4.716823850097833, "language_loss": 0.79111636, "learning_rate": 3.8415915020233574e-06, "loss": 0.81270957, "num_input_tokens_seen": 45857735, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.83203125, "step": 2123, "time_per_iteration": 2.3637309074401855 }, { "auxiliary_loss_clip": 0.01115774, "auxiliary_loss_mlp": 0.01051057, "balance_loss_clip": 1.02757287, "balance_loss_mlp": 1.03227031, "epoch": 0.12770178866676687, "flos": 22381595003520.0, "grad_norm": 1.6150056217743856, "language_loss": 0.78939128, "learning_rate": 3.8414441014135045e-06, "loss": 0.81105959, "num_input_tokens_seen": 45876485, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.8359375, "step": 2124, "time_per_iteration": 2.377854108810425 }, { "auxiliary_loss_clip": 0.01115417, "auxiliary_loss_mlp": 0.01044485, "balance_loss_clip": 1.02244282, "balance_loss_mlp": 1.02979326, "epoch": 0.12776191191943484, "flos": 21651421495680.0, "grad_norm": 2.004458347345125, "language_loss": 0.75415641, "learning_rate": 3.8412966350872475e-06, "loss": 0.77575541, "num_input_tokens_seen": 45894645, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.85546875, "step": 2125, "time_per_iteration": 2.3942389488220215 }, { "auxiliary_loss_clip": 0.01112755, "auxiliary_loss_mlp": 0.01042692, "balance_loss_clip": 1.02105582, "balance_loss_mlp": 1.03033376, "epoch": 0.1278220351721028, "flos": 25734479155200.0, "grad_norm": 2.3313958258670318, "language_loss": 0.77859557, "learning_rate": 3.841149103049851e-06, "loss": 0.80015004, "num_input_tokens_seen": 45913755, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.82421875, "step": 2126, "time_per_iteration": 2.4090161323547363 }, { "auxiliary_loss_clip": 0.01112666, "auxiliary_loss_mlp": 0.01045773, "balance_loss_clip": 1.0235647, "balance_loss_mlp": 1.03223729, "epoch": 0.12788215842477077, "flos": 41241676521600.0, "grad_norm": 1.5875149616149478, "language_loss": 0.69364333, "learning_rate": 3.8410015053065785e-06, "loss": 0.71522772, "num_input_tokens_seen": 45936095, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8046875, "step": 2127, "time_per_iteration": 2.5647130012512207 }, { "auxiliary_loss_clip": 0.01032074, "auxiliary_loss_mlp": 0.01005287, "balance_loss_clip": 1.00142455, "balance_loss_mlp": 1.00704312, "epoch": 0.12794228167743876, "flos": 70873822817280.0, "grad_norm": 0.8476597553826426, "language_loss": 0.62836862, "learning_rate": 3.8408538418626985e-06, "loss": 0.6487422, "num_input_tokens_seen": 46004655, "router_z_loss_clip": 0.03857422, "router_z_loss_mlp": 0.25, "step": 2128, "time_per_iteration": 3.063953399658203 }, { "auxiliary_loss_clip": 0.01113648, "auxiliary_loss_mlp": 0.01036754, "balance_loss_clip": 1.01316237, "balance_loss_mlp": 1.03077292, "epoch": 0.12800240493010673, "flos": 16288797173760.0, "grad_norm": 2.753633175846378, "language_loss": 0.77115464, "learning_rate": 3.840706112723479e-06, "loss": 0.79265857, "num_input_tokens_seen": 46023610, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.828125, "step": 2129, "time_per_iteration": 2.3833889961242676 }, { "auxiliary_loss_clip": 0.0112229, "auxiliary_loss_mlp": 0.01046874, "balance_loss_clip": 1.02174425, "balance_loss_mlp": 1.03358889, "epoch": 0.1280625281827747, "flos": 20630491251840.0, "grad_norm": 2.0155953904423067, "language_loss": 0.79008496, "learning_rate": 3.840558317894194e-06, "loss": 0.81177664, "num_input_tokens_seen": 46041725, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.88671875, "step": 2130, "time_per_iteration": 2.378575325012207 }, { "auxiliary_loss_clip": 0.01115906, "auxiliary_loss_mlp": 0.01041164, "balance_loss_clip": 1.01815629, "balance_loss_mlp": 1.03099012, "epoch": 0.12812265143544266, "flos": 22637124311040.0, "grad_norm": 2.2626543637482257, "language_loss": 0.70868599, "learning_rate": 3.840410457380117e-06, "loss": 0.73025668, "num_input_tokens_seen": 46061095, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.8515625, "step": 2131, "time_per_iteration": 2.4017488956451416 }, { "auxiliary_loss_clip": 0.01116169, "auxiliary_loss_mlp": 0.01041838, "balance_loss_clip": 1.01891422, "balance_loss_mlp": 1.03228021, "epoch": 0.12818277468811062, "flos": 34713267258240.0, "grad_norm": 2.4095980582332497, "language_loss": 0.72449213, "learning_rate": 3.840262531186525e-06, "loss": 0.74607217, "num_input_tokens_seen": 46082670, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.83984375, "step": 2132, "time_per_iteration": 2.497260332107544 }, { "auxiliary_loss_clip": 0.01112032, "auxiliary_loss_mlp": 0.01044994, "balance_loss_clip": 1.02172446, "balance_loss_mlp": 1.03138447, "epoch": 0.1282428979407786, "flos": 23111000461440.0, "grad_norm": 2.3302926949069236, "language_loss": 0.82523346, "learning_rate": 3.840114539318697e-06, "loss": 0.84680378, "num_input_tokens_seen": 46102410, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.80859375, "step": 2133, "time_per_iteration": 2.3946878910064697 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.02770627, "balance_loss_mlp": 1.03165603, "epoch": 0.12830302119344655, "flos": 20885461977600.0, "grad_norm": 2.325376780580096, "language_loss": 0.79481399, "learning_rate": 3.839966481781914e-06, "loss": 0.81653935, "num_input_tokens_seen": 46121145, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.890625, "step": 2134, "time_per_iteration": 2.3721272945404053 }, { "auxiliary_loss_clip": 0.01116204, "auxiliary_loss_mlp": 0.01045534, "balance_loss_clip": 1.02231169, "balance_loss_mlp": 1.03271592, "epoch": 0.12836314444611455, "flos": 21396695149440.0, "grad_norm": 1.9570237272098825, "language_loss": 0.82733893, "learning_rate": 3.83981835858146e-06, "loss": 0.84895641, "num_input_tokens_seen": 46140740, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8359375, "step": 2135, "time_per_iteration": 2.391282081604004 }, { "auxiliary_loss_clip": 0.01112881, "auxiliary_loss_mlp": 0.01050256, "balance_loss_clip": 1.02685499, "balance_loss_mlp": 1.03148592, "epoch": 0.1284232676987825, "flos": 13661617875840.0, "grad_norm": 2.3054997403547475, "language_loss": 0.77193314, "learning_rate": 3.839670169722622e-06, "loss": 0.79356444, "num_input_tokens_seen": 46156805, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8125, "step": 2136, "time_per_iteration": 2.342874050140381 }, { "auxiliary_loss_clip": 0.01033939, "auxiliary_loss_mlp": 0.01001595, "balance_loss_clip": 0.99828076, "balance_loss_mlp": 1.00933623, "epoch": 0.12848339095145048, "flos": 59991709968000.0, "grad_norm": 0.891959578830437, "language_loss": 0.59144431, "learning_rate": 3.839521915210688e-06, "loss": 0.6117996, "num_input_tokens_seen": 46222085, "router_z_loss_clip": 0.03320312, "router_z_loss_mlp": 0.24609375, "step": 2137, "time_per_iteration": 3.1718711853027344 }, { "auxiliary_loss_clip": 0.01112519, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.02251649, "balance_loss_mlp": 1.02958333, "epoch": 0.12854351420411844, "flos": 13880523300480.0, "grad_norm": 3.2122022524145843, "language_loss": 0.82409132, "learning_rate": 3.839373595050948e-06, "loss": 0.84566545, "num_input_tokens_seen": 46239970, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.828125, "step": 2138, "time_per_iteration": 2.3435263633728027 }, { "auxiliary_loss_clip": 0.01116734, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.02316427, "balance_loss_mlp": 1.03054476, "epoch": 0.1286036374567864, "flos": 22636845020160.0, "grad_norm": 2.599591190734799, "language_loss": 0.78714335, "learning_rate": 3.839225209248696e-06, "loss": 0.80877751, "num_input_tokens_seen": 46257740, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.859375, "step": 2139, "time_per_iteration": 2.3830668926239014 }, { "auxiliary_loss_clip": 0.0111658, "auxiliary_loss_mlp": 0.01042747, "balance_loss_clip": 1.02007365, "balance_loss_mlp": 1.03078604, "epoch": 0.12866376070945437, "flos": 16323884956800.0, "grad_norm": 2.1733964329112556, "language_loss": 0.85316467, "learning_rate": 3.839076757809228e-06, "loss": 0.87475795, "num_input_tokens_seen": 46275445, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.859375, "step": 2140, "time_per_iteration": 2.3494696617126465 }, { "auxiliary_loss_clip": 0.01112102, "auxiliary_loss_mlp": 0.01040162, "balance_loss_clip": 1.01962209, "balance_loss_mlp": 1.02963066, "epoch": 0.12872388396212234, "flos": 11873750595840.0, "grad_norm": 2.4315846790103257, "language_loss": 0.85440862, "learning_rate": 3.83892824073784e-06, "loss": 0.87593126, "num_input_tokens_seen": 46291710, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.82421875, "step": 2141, "time_per_iteration": 2.3729195594787598 }, { "auxiliary_loss_clip": 0.01118117, "auxiliary_loss_mlp": 0.01043187, "balance_loss_clip": 1.01964295, "balance_loss_mlp": 1.03118086, "epoch": 0.12878400721479033, "flos": 28365428880000.0, "grad_norm": 2.0216933033230786, "language_loss": 0.6776073, "learning_rate": 3.838779658039834e-06, "loss": 0.69922036, "num_input_tokens_seen": 46311335, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.87109375, "step": 2142, "time_per_iteration": 2.4438652992248535 }, { "auxiliary_loss_clip": 0.01119154, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.01579714, "balance_loss_mlp": 1.03259099, "epoch": 0.1288441304674583, "flos": 25884430911360.0, "grad_norm": 1.9742993299275668, "language_loss": 0.83022559, "learning_rate": 3.838631009720513e-06, "loss": 0.85181445, "num_input_tokens_seen": 46330985, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8671875, "step": 2143, "time_per_iteration": 2.43601131439209 }, { "auxiliary_loss_clip": 0.01118479, "auxiliary_loss_mlp": 0.01048113, "balance_loss_clip": 1.0251298, "balance_loss_mlp": 1.03445256, "epoch": 0.12890425372012626, "flos": 20812737882240.0, "grad_norm": 1.8165525936827422, "language_loss": 0.81771183, "learning_rate": 3.83848229578518e-06, "loss": 0.83937776, "num_input_tokens_seen": 46351295, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.83984375, "step": 2144, "time_per_iteration": 2.4065892696380615 }, { "auxiliary_loss_clip": 0.01114425, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.02498186, "balance_loss_mlp": 1.02971935, "epoch": 0.12896437697279423, "flos": 22564749329280.0, "grad_norm": 2.18530696944292, "language_loss": 0.78207135, "learning_rate": 3.838333516239142e-06, "loss": 0.80370283, "num_input_tokens_seen": 46368600, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.84765625, "step": 2145, "time_per_iteration": 2.3868374824523926 }, { "auxiliary_loss_clip": 0.01117629, "auxiliary_loss_mlp": 0.01048487, "balance_loss_clip": 1.02367997, "balance_loss_mlp": 1.03120184, "epoch": 0.1290245002254622, "flos": 17492811920640.0, "grad_norm": 2.506205522025062, "language_loss": 0.82470876, "learning_rate": 3.83818467108771e-06, "loss": 0.84636986, "num_input_tokens_seen": 46387370, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.8671875, "step": 2146, "time_per_iteration": 2.360330820083618 }, { "auxiliary_loss_clip": 0.01117362, "auxiliary_loss_mlp": 0.01044412, "balance_loss_clip": 1.02117813, "balance_loss_mlp": 1.03334451, "epoch": 0.12908462347813016, "flos": 36314593810560.0, "grad_norm": 3.324351171390452, "language_loss": 0.71070415, "learning_rate": 3.838035760336196e-06, "loss": 0.73232186, "num_input_tokens_seen": 46409570, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.83984375, "step": 2147, "time_per_iteration": 2.5244247913360596 }, { "auxiliary_loss_clip": 0.01115186, "auxiliary_loss_mlp": 0.01044679, "balance_loss_clip": 1.02238691, "balance_loss_mlp": 1.03071284, "epoch": 0.12914474673079815, "flos": 22527601776000.0, "grad_norm": 2.3566456445030193, "language_loss": 0.71779263, "learning_rate": 3.837886783989914e-06, "loss": 0.73939127, "num_input_tokens_seen": 46429320, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.84375, "step": 2148, "time_per_iteration": 3.7509665489196777 }, { "auxiliary_loss_clip": 0.01112953, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.01605821, "balance_loss_mlp": 1.03363156, "epoch": 0.12920486998346611, "flos": 21470780787840.0, "grad_norm": 1.4622631739764205, "language_loss": 0.78898561, "learning_rate": 3.837737742054179e-06, "loss": 0.81048834, "num_input_tokens_seen": 46450155, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.79296875, "step": 2149, "time_per_iteration": 2.397650718688965 }, { "auxiliary_loss_clip": 0.0111474, "auxiliary_loss_mlp": 0.01038862, "balance_loss_clip": 1.01599813, "balance_loss_mlp": 1.0322175, "epoch": 0.12926499323613408, "flos": 27307316171520.0, "grad_norm": 1.9198554036163238, "language_loss": 0.76388699, "learning_rate": 3.837588634534312e-06, "loss": 0.78542304, "num_input_tokens_seen": 46470280, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.82421875, "step": 2150, "time_per_iteration": 3.8543319702148438 }, { "auxiliary_loss_clip": 0.01115536, "auxiliary_loss_mlp": 0.01046366, "balance_loss_clip": 1.0236088, "balance_loss_mlp": 1.03230286, "epoch": 0.12932511648880204, "flos": 22090035306240.0, "grad_norm": 2.7470104760426186, "language_loss": 0.70360446, "learning_rate": 3.837439461435634e-06, "loss": 0.72522342, "num_input_tokens_seen": 46487605, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.83203125, "step": 2151, "time_per_iteration": 3.801199197769165 }, { "auxiliary_loss_clip": 0.01115444, "auxiliary_loss_mlp": 0.01038906, "balance_loss_clip": 1.0167923, "balance_loss_mlp": 1.03347862, "epoch": 0.12938523974147, "flos": 20301749089920.0, "grad_norm": 1.893864815881546, "language_loss": 0.84205532, "learning_rate": 3.837290222763467e-06, "loss": 0.86359888, "num_input_tokens_seen": 46505100, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.8203125, "step": 2152, "time_per_iteration": 3.7916481494903564 }, { "auxiliary_loss_clip": 0.01115379, "auxiliary_loss_mlp": 0.01048931, "balance_loss_clip": 1.02667511, "balance_loss_mlp": 1.03076506, "epoch": 0.12944536299413797, "flos": 19498956220800.0, "grad_norm": 1.7597086564245399, "language_loss": 0.78322285, "learning_rate": 3.837140918523139e-06, "loss": 0.80486596, "num_input_tokens_seen": 46524020, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.84765625, "step": 2153, "time_per_iteration": 2.3810129165649414 }, { "auxiliary_loss_clip": 0.01114693, "auxiliary_loss_mlp": 0.01046041, "balance_loss_clip": 1.02345061, "balance_loss_mlp": 1.03024554, "epoch": 0.12950548624680594, "flos": 27706722658560.0, "grad_norm": 1.5810401249289232, "language_loss": 0.80105108, "learning_rate": 3.836991548719977e-06, "loss": 0.82265842, "num_input_tokens_seen": 46544640, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.84375, "step": 2154, "time_per_iteration": 2.4317073822021484 }, { "auxiliary_loss_clip": 0.01117873, "auxiliary_loss_mlp": 0.01045428, "balance_loss_clip": 1.02213466, "balance_loss_mlp": 1.03387988, "epoch": 0.12956560949947393, "flos": 17564802877440.0, "grad_norm": 1.883418700162283, "language_loss": 0.83143741, "learning_rate": 3.836842113359312e-06, "loss": 0.85307044, "num_input_tokens_seen": 46561395, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.83984375, "step": 2155, "time_per_iteration": 2.3684208393096924 }, { "auxiliary_loss_clip": 0.01116601, "auxiliary_loss_mlp": 0.0104286, "balance_loss_clip": 1.01961398, "balance_loss_mlp": 1.03140187, "epoch": 0.1296257327521419, "flos": 20739664673280.0, "grad_norm": 2.60337306720873, "language_loss": 0.75139713, "learning_rate": 3.836692612446477e-06, "loss": 0.77299178, "num_input_tokens_seen": 46579395, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8515625, "step": 2156, "time_per_iteration": 2.3783843517303467 }, { "auxiliary_loss_clip": 0.01110463, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.01924503, "balance_loss_mlp": 1.03018153, "epoch": 0.12968585600480986, "flos": 16394898395520.0, "grad_norm": 1.8736014366258005, "language_loss": 0.86187625, "learning_rate": 3.836543045986806e-06, "loss": 0.88339609, "num_input_tokens_seen": 46597090, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.80078125, "step": 2157, "time_per_iteration": 2.371201515197754 }, { "auxiliary_loss_clip": 0.01114903, "auxiliary_loss_mlp": 0.01045962, "balance_loss_clip": 1.02079701, "balance_loss_mlp": 1.02920556, "epoch": 0.12974597925747783, "flos": 28328281326720.0, "grad_norm": 2.5809651804015252, "language_loss": 0.80159575, "learning_rate": 3.836393413985639e-06, "loss": 0.8232044, "num_input_tokens_seen": 46617355, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.85546875, "step": 2158, "time_per_iteration": 2.4271130561828613 }, { "auxiliary_loss_clip": 0.01119188, "auxiliary_loss_mlp": 0.01044262, "balance_loss_clip": 1.02031255, "balance_loss_mlp": 1.03189647, "epoch": 0.1298061025101458, "flos": 9682357288320.0, "grad_norm": 2.3233885540699752, "language_loss": 0.74530011, "learning_rate": 3.836243716448315e-06, "loss": 0.76693463, "num_input_tokens_seen": 46633130, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.875, "step": 2159, "time_per_iteration": 2.3568336963653564 }, { "auxiliary_loss_clip": 0.01110925, "auxiliary_loss_mlp": 0.01043432, "balance_loss_clip": 1.0205555, "balance_loss_mlp": 1.02931619, "epoch": 0.12986622576281376, "flos": 27708293669760.0, "grad_norm": 1.9939973787459886, "language_loss": 0.82547617, "learning_rate": 3.8360939533801755e-06, "loss": 0.84701967, "num_input_tokens_seen": 46650575, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.81640625, "step": 2160, "time_per_iteration": 2.436048984527588 }, { "auxiliary_loss_clip": 0.01111275, "auxiliary_loss_mlp": 0.01039301, "balance_loss_clip": 1.01673508, "balance_loss_mlp": 1.03352499, "epoch": 0.12992634901548175, "flos": 18801845637120.0, "grad_norm": 1.6115067908069767, "language_loss": 0.8194257, "learning_rate": 3.835944124786566e-06, "loss": 0.84093148, "num_input_tokens_seen": 46668780, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.77734375, "step": 2161, "time_per_iteration": 2.3671443462371826 }, { "auxiliary_loss_clip": 0.0111232, "auxiliary_loss_mlp": 0.01045213, "balance_loss_clip": 1.02225292, "balance_loss_mlp": 1.03120661, "epoch": 0.12998647226814972, "flos": 29126430984960.0, "grad_norm": 2.48374558455811, "language_loss": 0.82309949, "learning_rate": 3.835794230672833e-06, "loss": 0.84467483, "num_input_tokens_seen": 46687550, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.8125, "step": 2162, "time_per_iteration": 2.451164722442627 }, { "auxiliary_loss_clip": 0.01114268, "auxiliary_loss_mlp": 0.01039767, "balance_loss_clip": 1.01642549, "balance_loss_mlp": 1.02928841, "epoch": 0.13004659552081768, "flos": 19572657834240.0, "grad_norm": 2.722292118622819, "language_loss": 0.73022962, "learning_rate": 3.8356442710443264e-06, "loss": 0.75177002, "num_input_tokens_seen": 46706730, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.8515625, "step": 2163, "time_per_iteration": 2.3654446601867676 }, { "auxiliary_loss_clip": 0.01118636, "auxiliary_loss_mlp": 0.01045333, "balance_loss_clip": 1.0205729, "balance_loss_mlp": 1.03252673, "epoch": 0.13010671877348565, "flos": 22489651261440.0, "grad_norm": 2.095764827754389, "language_loss": 0.80722785, "learning_rate": 3.835494245906398e-06, "loss": 0.82886755, "num_input_tokens_seen": 46724250, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.859375, "step": 2164, "time_per_iteration": 2.3870291709899902 }, { "auxiliary_loss_clip": 0.01114403, "auxiliary_loss_mlp": 0.01043411, "balance_loss_clip": 1.02221584, "balance_loss_mlp": 1.03097296, "epoch": 0.1301668420261536, "flos": 23877099624960.0, "grad_norm": 3.8116589712356306, "language_loss": 0.72372723, "learning_rate": 3.835344155264401e-06, "loss": 0.74530542, "num_input_tokens_seen": 46744105, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.83203125, "step": 2165, "time_per_iteration": 2.4053943157196045 }, { "auxiliary_loss_clip": 0.01116581, "auxiliary_loss_mlp": 0.0104437, "balance_loss_clip": 1.02080274, "balance_loss_mlp": 1.03170514, "epoch": 0.13022696527882158, "flos": 23148916064640.0, "grad_norm": 1.9997241389468778, "language_loss": 0.74730682, "learning_rate": 3.835193999123692e-06, "loss": 0.76891643, "num_input_tokens_seen": 46764250, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.84765625, "step": 2166, "time_per_iteration": 2.3938140869140625 }, { "auxiliary_loss_clip": 0.01114391, "auxiliary_loss_mlp": 0.01047195, "balance_loss_clip": 1.02433074, "balance_loss_mlp": 1.03019023, "epoch": 0.13028708853148954, "flos": 26907281280000.0, "grad_norm": 2.0620299613784137, "language_loss": 0.83216614, "learning_rate": 3.83504377748963e-06, "loss": 0.853782, "num_input_tokens_seen": 46786865, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.84375, "step": 2167, "time_per_iteration": 2.4385018348693848 }, { "auxiliary_loss_clip": 0.01115013, "auxiliary_loss_mlp": 0.01049261, "balance_loss_clip": 1.02614653, "balance_loss_mlp": 1.03209972, "epoch": 0.13034721178415754, "flos": 21250409086080.0, "grad_norm": 1.513359311460835, "language_loss": 0.8302232, "learning_rate": 3.834893490367576e-06, "loss": 0.85186589, "num_input_tokens_seen": 46807030, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.828125, "step": 2168, "time_per_iteration": 2.40877628326416 }, { "auxiliary_loss_clip": 0.0111569, "auxiliary_loss_mlp": 0.01046431, "balance_loss_clip": 1.02275646, "balance_loss_mlp": 1.0308814, "epoch": 0.1304073350368255, "flos": 18766338917760.0, "grad_norm": 1.984090395510942, "language_loss": 0.80360681, "learning_rate": 3.834743137762894e-06, "loss": 0.82522798, "num_input_tokens_seen": 46826280, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.84765625, "step": 2169, "time_per_iteration": 2.3826045989990234 }, { "auxiliary_loss_clip": 0.0111662, "auxiliary_loss_mlp": 0.01039846, "balance_loss_clip": 1.01742303, "balance_loss_mlp": 1.03232527, "epoch": 0.13046745828949347, "flos": 28363438932480.0, "grad_norm": 2.2305857081038893, "language_loss": 0.6652239, "learning_rate": 3.834592719680948e-06, "loss": 0.68678856, "num_input_tokens_seen": 46846505, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.84375, "step": 2170, "time_per_iteration": 2.461083173751831 }, { "auxiliary_loss_clip": 0.01114168, "auxiliary_loss_mlp": 0.01043086, "balance_loss_clip": 1.01999474, "balance_loss_mlp": 1.03133297, "epoch": 0.13052758154216143, "flos": 29603798271360.0, "grad_norm": 1.7907177832851473, "language_loss": 0.66911954, "learning_rate": 3.834442236127107e-06, "loss": 0.69069207, "num_input_tokens_seen": 46867380, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.828125, "step": 2171, "time_per_iteration": 2.4362990856170654 }, { "auxiliary_loss_clip": 0.01113873, "auxiliary_loss_mlp": 0.0104092, "balance_loss_clip": 1.01773369, "balance_loss_mlp": 1.03114748, "epoch": 0.1305877047948294, "flos": 19389852622080.0, "grad_norm": 3.7675128109353753, "language_loss": 0.71806937, "learning_rate": 3.834291687106741e-06, "loss": 0.73961735, "num_input_tokens_seen": 46886810, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.828125, "step": 2172, "time_per_iteration": 2.380793333053589 }, { "auxiliary_loss_clip": 0.01112477, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.0148859, "balance_loss_mlp": 1.03321958, "epoch": 0.13064782804749736, "flos": 16872579884160.0, "grad_norm": 2.3902403008806186, "language_loss": 0.75815773, "learning_rate": 3.834141072625224e-06, "loss": 0.77964425, "num_input_tokens_seen": 46905620, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.79296875, "step": 2173, "time_per_iteration": 2.3549892902374268 }, { "auxiliary_loss_clip": 0.01115196, "auxiliary_loss_mlp": 0.0105006, "balance_loss_clip": 1.0247401, "balance_loss_mlp": 1.03040016, "epoch": 0.13070795130016533, "flos": 24497925154560.0, "grad_norm": 2.9225325642837494, "language_loss": 0.70756316, "learning_rate": 3.833990392687929e-06, "loss": 0.72921574, "num_input_tokens_seen": 46925120, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.84765625, "step": 2174, "time_per_iteration": 2.4198758602142334 }, { "auxiliary_loss_clip": 0.01028906, "auxiliary_loss_mlp": 0.01013085, "balance_loss_clip": 1.00998521, "balance_loss_mlp": 1.00515437, "epoch": 0.13076807455283332, "flos": 71051042211840.0, "grad_norm": 0.7942536445789119, "language_loss": 0.59030503, "learning_rate": 3.833839647300235e-06, "loss": 0.61072493, "num_input_tokens_seen": 46988195, "router_z_loss_clip": 0.03088379, "router_z_loss_mlp": 0.23828125, "step": 2175, "time_per_iteration": 3.144157648086548 }, { "auxiliary_loss_clip": 0.01114359, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.02068412, "balance_loss_mlp": 1.03204215, "epoch": 0.13082819780550128, "flos": 20263519284480.0, "grad_norm": 2.0487518159718525, "language_loss": 0.79935825, "learning_rate": 3.8336888364675215e-06, "loss": 0.82094073, "num_input_tokens_seen": 47004720, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8203125, "step": 2176, "time_per_iteration": 2.3896164894104004 }, { "auxiliary_loss_clip": 0.01114783, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.02339339, "balance_loss_mlp": 1.03182983, "epoch": 0.13088832105816925, "flos": 34202034086400.0, "grad_norm": 1.8814432995606216, "language_loss": 0.74356544, "learning_rate": 3.83353796019517e-06, "loss": 0.76518464, "num_input_tokens_seen": 47024255, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.828125, "step": 2177, "time_per_iteration": 2.495523691177368 }, { "auxiliary_loss_clip": 0.01109811, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.0149163, "balance_loss_mlp": 1.02974164, "epoch": 0.13094844431083721, "flos": 17893998887040.0, "grad_norm": 3.209198226754352, "language_loss": 0.81699485, "learning_rate": 3.833387018488565e-06, "loss": 0.8384766, "num_input_tokens_seen": 47042465, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.80078125, "step": 2178, "time_per_iteration": 2.3795485496520996 }, { "auxiliary_loss_clip": 0.01114659, "auxiliary_loss_mlp": 0.01045616, "balance_loss_clip": 1.02328825, "balance_loss_mlp": 1.03250599, "epoch": 0.13100856756350518, "flos": 17310355822080.0, "grad_norm": 2.5560615218726506, "language_loss": 0.74238646, "learning_rate": 3.833236011353094e-06, "loss": 0.76398921, "num_input_tokens_seen": 47060370, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8203125, "step": 2179, "time_per_iteration": 2.342864751815796 }, { "auxiliary_loss_clip": 0.01110019, "auxiliary_loss_mlp": 0.01040483, "balance_loss_clip": 1.0178926, "balance_loss_mlp": 1.03046405, "epoch": 0.13106869081617314, "flos": 22199453107200.0, "grad_norm": 2.048661509886946, "language_loss": 0.84644121, "learning_rate": 3.833084938794144e-06, "loss": 0.86794627, "num_input_tokens_seen": 47081415, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.79296875, "step": 2180, "time_per_iteration": 2.407223701477051 }, { "auxiliary_loss_clip": 0.01112943, "auxiliary_loss_mlp": 0.01047561, "balance_loss_clip": 1.02467299, "balance_loss_mlp": 1.0332936, "epoch": 0.13112881406884114, "flos": 21762026282880.0, "grad_norm": 1.9753502228991404, "language_loss": 0.89866793, "learning_rate": 3.832933800817109e-06, "loss": 0.92027295, "num_input_tokens_seen": 47099860, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.796875, "step": 2181, "time_per_iteration": 2.3756368160247803 }, { "auxiliary_loss_clip": 0.01114895, "auxiliary_loss_mlp": 0.01039846, "balance_loss_clip": 1.01674366, "balance_loss_mlp": 1.03142238, "epoch": 0.1311889373215091, "flos": 23329975708800.0, "grad_norm": 1.9743377326784572, "language_loss": 0.68522978, "learning_rate": 3.832782597427381e-06, "loss": 0.70677722, "num_input_tokens_seen": 47118540, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8359375, "step": 2182, "time_per_iteration": 2.4033896923065186 }, { "auxiliary_loss_clip": 0.01112122, "auxiliary_loss_mlp": 0.01042636, "balance_loss_clip": 1.01881838, "balance_loss_mlp": 1.03059769, "epoch": 0.13124906057417707, "flos": 21466381956480.0, "grad_norm": 2.201104097311233, "language_loss": 0.78646314, "learning_rate": 3.832631328630357e-06, "loss": 0.8080107, "num_input_tokens_seen": 47136710, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.81640625, "step": 2183, "time_per_iteration": 2.3812484741210938 }, { "auxiliary_loss_clip": 0.01112053, "auxiliary_loss_mlp": 0.01041131, "balance_loss_clip": 1.01770616, "balance_loss_mlp": 1.03083646, "epoch": 0.13130918382684503, "flos": 23254284147840.0, "grad_norm": 1.775200535092224, "language_loss": 0.85511321, "learning_rate": 3.832479994431435e-06, "loss": 0.87664509, "num_input_tokens_seen": 47157155, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8125, "step": 2184, "time_per_iteration": 2.4094576835632324 }, { "auxiliary_loss_clip": 0.01112983, "auxiliary_loss_mlp": 0.01048421, "balance_loss_clip": 1.02326775, "balance_loss_mlp": 1.02956557, "epoch": 0.131369307079513, "flos": 20849222119680.0, "grad_norm": 1.8957297581875063, "language_loss": 0.81803644, "learning_rate": 3.8323285948360155e-06, "loss": 0.83965051, "num_input_tokens_seen": 47176820, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 0.83203125, "step": 2185, "time_per_iteration": 2.3804616928100586 }, { "auxiliary_loss_clip": 0.01114125, "auxiliary_loss_mlp": 0.01049463, "balance_loss_clip": 1.02526331, "balance_loss_mlp": 1.02938843, "epoch": 0.13142943033218096, "flos": 17857375004160.0, "grad_norm": 2.2495711618574887, "language_loss": 0.73018312, "learning_rate": 3.832177129849501e-06, "loss": 0.75181901, "num_input_tokens_seen": 47195855, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.84765625, "step": 2186, "time_per_iteration": 2.3832266330718994 }, { "auxiliary_loss_clip": 0.01111766, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.01833165, "balance_loss_mlp": 1.03020239, "epoch": 0.13148955358484893, "flos": 20994984512640.0, "grad_norm": 1.9049824607495724, "language_loss": 0.79982936, "learning_rate": 3.832025599477299e-06, "loss": 0.82137549, "num_input_tokens_seen": 47214535, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.81640625, "step": 2187, "time_per_iteration": 3.7775983810424805 }, { "auxiliary_loss_clip": 0.01029256, "auxiliary_loss_mlp": 0.01001701, "balance_loss_clip": 0.99886429, "balance_loss_mlp": 1.00545847, "epoch": 0.13154967683751692, "flos": 70169206291200.0, "grad_norm": 0.8418638121387123, "language_loss": 0.59020334, "learning_rate": 3.831874003724815e-06, "loss": 0.61051291, "num_input_tokens_seen": 47270300, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.23828125, "step": 2188, "time_per_iteration": 3.027845859527588 }, { "auxiliary_loss_clip": 0.01113306, "auxiliary_loss_mlp": 0.0104065, "balance_loss_clip": 1.0176661, "balance_loss_mlp": 1.03258336, "epoch": 0.1316098000901849, "flos": 20375101589760.0, "grad_norm": 2.7635906312789493, "language_loss": 0.74211311, "learning_rate": 3.83172234259746e-06, "loss": 0.76365268, "num_input_tokens_seen": 47290720, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.80859375, "step": 2189, "time_per_iteration": 2.404848575592041 }, { "auxiliary_loss_clip": 0.01111108, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.02089787, "balance_loss_mlp": 1.03086686, "epoch": 0.13166992334285285, "flos": 23220034237440.0, "grad_norm": 2.841583407847499, "language_loss": 0.72652352, "learning_rate": 3.831570616100646e-06, "loss": 0.74806881, "num_input_tokens_seen": 47311820, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.80078125, "step": 2190, "time_per_iteration": 3.9414379596710205 }, { "auxiliary_loss_clip": 0.01114807, "auxiliary_loss_mlp": 0.01043525, "balance_loss_clip": 1.02108979, "balance_loss_mlp": 1.03255475, "epoch": 0.13173004659552082, "flos": 23329836063360.0, "grad_norm": 2.0700281427836646, "language_loss": 0.74798489, "learning_rate": 3.831418824239789e-06, "loss": 0.7695682, "num_input_tokens_seen": 47331605, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.82421875, "step": 2191, "time_per_iteration": 2.4002647399902344 }, { "auxiliary_loss_clip": 0.01113795, "auxiliary_loss_mlp": 0.01042197, "balance_loss_clip": 1.0187366, "balance_loss_mlp": 1.03179884, "epoch": 0.13179016984818878, "flos": 21250443997440.0, "grad_norm": 1.8918926065518962, "language_loss": 0.79094386, "learning_rate": 3.831266967020304e-06, "loss": 0.81250381, "num_input_tokens_seen": 47350455, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8203125, "step": 2192, "time_per_iteration": 3.7655372619628906 }, { "auxiliary_loss_clip": 0.01113748, "auxiliary_loss_mlp": 0.01049621, "balance_loss_clip": 1.02704287, "balance_loss_mlp": 1.03156579, "epoch": 0.13185029310085675, "flos": 17777913016320.0, "grad_norm": 1.8315977299639683, "language_loss": 0.85026896, "learning_rate": 3.831115044447613e-06, "loss": 0.87190259, "num_input_tokens_seen": 47368225, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.8203125, "step": 2193, "time_per_iteration": 2.3712708950042725 }, { "auxiliary_loss_clip": 0.01027596, "auxiliary_loss_mlp": 0.01006696, "balance_loss_clip": 1.00401342, "balance_loss_mlp": 1.00445843, "epoch": 0.1319104163535247, "flos": 69848319185280.0, "grad_norm": 0.7516439486959761, "language_loss": 0.5407998, "learning_rate": 3.830963056527136e-06, "loss": 0.56114268, "num_input_tokens_seen": 47427125, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.23242188, "step": 2194, "time_per_iteration": 2.982822895050049 }, { "auxiliary_loss_clip": 0.01111447, "auxiliary_loss_mlp": 0.0104063, "balance_loss_clip": 1.0185287, "balance_loss_mlp": 1.02902436, "epoch": 0.1319705396061927, "flos": 25191893715840.0, "grad_norm": 2.706274089240783, "language_loss": 0.72682089, "learning_rate": 3.830811003264296e-06, "loss": 0.74834168, "num_input_tokens_seen": 47450275, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.82421875, "step": 2195, "time_per_iteration": 2.4383349418640137 }, { "auxiliary_loss_clip": 0.01116518, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.01201487, "balance_loss_mlp": 1.03085577, "epoch": 0.13203066285886067, "flos": 20739420293760.0, "grad_norm": 2.188057333145781, "language_loss": 0.77745765, "learning_rate": 3.830658884664522e-06, "loss": 0.79898763, "num_input_tokens_seen": 47469155, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.859375, "step": 2196, "time_per_iteration": 2.4020564556121826 }, { "auxiliary_loss_clip": 0.01112904, "auxiliary_loss_mlp": 0.01046155, "balance_loss_clip": 1.02268314, "balance_loss_mlp": 1.03074789, "epoch": 0.13209078611152864, "flos": 22053306689280.0, "grad_norm": 2.3314154498043687, "language_loss": 0.74964809, "learning_rate": 3.830506700733241e-06, "loss": 0.77123868, "num_input_tokens_seen": 47488405, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8203125, "step": 2197, "time_per_iteration": 2.3787591457366943 }, { "auxiliary_loss_clip": 0.01112553, "auxiliary_loss_mlp": 0.01036321, "balance_loss_clip": 1.01301527, "balance_loss_mlp": 1.03028679, "epoch": 0.1321509093641966, "flos": 16284153962880.0, "grad_norm": 1.9713334442069481, "language_loss": 0.79398841, "learning_rate": 3.830354451475884e-06, "loss": 0.81547713, "num_input_tokens_seen": 47505650, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8203125, "step": 2198, "time_per_iteration": 2.3466055393218994 }, { "auxiliary_loss_clip": 0.01110413, "auxiliary_loss_mlp": 0.01045317, "balance_loss_clip": 1.02369249, "balance_loss_mlp": 1.0311594, "epoch": 0.13221103261686457, "flos": 16982067507840.0, "grad_norm": 2.16452769131162, "language_loss": 0.82794964, "learning_rate": 3.830202136897886e-06, "loss": 0.84950697, "num_input_tokens_seen": 47521540, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.79296875, "step": 2199, "time_per_iteration": 2.3588204383850098 }, { "auxiliary_loss_clip": 0.01114966, "auxiliary_loss_mlp": 0.01045151, "balance_loss_clip": 1.02260888, "balance_loss_mlp": 1.03232229, "epoch": 0.13227115586953253, "flos": 34232373924480.0, "grad_norm": 2.0674267205343058, "language_loss": 0.69267744, "learning_rate": 3.8300497570046804e-06, "loss": 0.71427858, "num_input_tokens_seen": 47543625, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.82421875, "step": 2200, "time_per_iteration": 2.498288631439209 }, { "auxiliary_loss_clip": 0.01110224, "auxiliary_loss_mlp": 0.01044977, "balance_loss_clip": 1.02214885, "balance_loss_mlp": 1.02931237, "epoch": 0.13233127912220052, "flos": 20703599372160.0, "grad_norm": 1.72394146433636, "language_loss": 0.84412003, "learning_rate": 3.829897311801707e-06, "loss": 0.86567205, "num_input_tokens_seen": 47563740, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.80859375, "step": 2201, "time_per_iteration": 2.41595721244812 }, { "auxiliary_loss_clip": 0.0111353, "auxiliary_loss_mlp": 0.0103892, "balance_loss_clip": 1.01606727, "balance_loss_mlp": 1.03146267, "epoch": 0.1323914023748685, "flos": 25804061228160.0, "grad_norm": 1.8939885495026298, "language_loss": 0.8684684, "learning_rate": 3.829744801294406e-06, "loss": 0.88999289, "num_input_tokens_seen": 47582655, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8203125, "step": 2202, "time_per_iteration": 2.4088144302368164 }, { "auxiliary_loss_clip": 0.01108057, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.02104235, "balance_loss_mlp": 1.02929723, "epoch": 0.13245152562753645, "flos": 21250478908800.0, "grad_norm": 1.9619794150131111, "language_loss": 0.72687638, "learning_rate": 3.8295922254882186e-06, "loss": 0.74838626, "num_input_tokens_seen": 47600875, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.7890625, "step": 2203, "time_per_iteration": 2.4206366539001465 }, { "auxiliary_loss_clip": 0.011129, "auxiliary_loss_mlp": 0.01045435, "balance_loss_clip": 1.02400172, "balance_loss_mlp": 1.03066564, "epoch": 0.13251164888020442, "flos": 26609856474240.0, "grad_norm": 2.3983479091674726, "language_loss": 0.73204589, "learning_rate": 3.829439584388591e-06, "loss": 0.75362927, "num_input_tokens_seen": 47619250, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.82421875, "step": 2204, "time_per_iteration": 2.413806676864624 }, { "auxiliary_loss_clip": 0.01114894, "auxiliary_loss_mlp": 0.01049636, "balance_loss_clip": 1.02656865, "balance_loss_mlp": 1.03270948, "epoch": 0.13257177213287238, "flos": 29825217313920.0, "grad_norm": 1.627487404452735, "language_loss": 0.78527379, "learning_rate": 3.8292868780009715e-06, "loss": 0.8069191, "num_input_tokens_seen": 47639445, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8203125, "step": 2205, "time_per_iteration": 2.448495626449585 }, { "auxiliary_loss_clip": 0.01112746, "auxiliary_loss_mlp": 0.01042662, "balance_loss_clip": 1.01917768, "balance_loss_mlp": 1.03346515, "epoch": 0.13263189538554035, "flos": 21287382082560.0, "grad_norm": 3.000510399921291, "language_loss": 0.78886485, "learning_rate": 3.829134106330809e-06, "loss": 0.81041896, "num_input_tokens_seen": 47658740, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.79296875, "step": 2206, "time_per_iteration": 2.371365785598755 }, { "auxiliary_loss_clip": 0.01112996, "auxiliary_loss_mlp": 0.0104297, "balance_loss_clip": 1.02140486, "balance_loss_mlp": 1.03161049, "epoch": 0.13269201863820831, "flos": 16873138465920.0, "grad_norm": 1.9451013166631212, "language_loss": 0.74432611, "learning_rate": 3.828981269383554e-06, "loss": 0.76588583, "num_input_tokens_seen": 47676880, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.8125, "step": 2207, "time_per_iteration": 2.374790668487549 }, { "auxiliary_loss_clip": 0.01109145, "auxiliary_loss_mlp": 0.01041137, "balance_loss_clip": 1.01826119, "balance_loss_mlp": 1.02947807, "epoch": 0.1327521418908763, "flos": 23767786558080.0, "grad_norm": 1.6835914547074657, "language_loss": 0.8392238, "learning_rate": 3.828828367164663e-06, "loss": 0.86072659, "num_input_tokens_seen": 47696635, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.796875, "step": 2208, "time_per_iteration": 2.4047040939331055 }, { "auxiliary_loss_clip": 0.01109737, "auxiliary_loss_mlp": 0.010438, "balance_loss_clip": 1.02266455, "balance_loss_mlp": 1.03286314, "epoch": 0.13281226514354427, "flos": 26504383656960.0, "grad_norm": 1.640844661454858, "language_loss": 0.84896123, "learning_rate": 3.828675399679592e-06, "loss": 0.87049663, "num_input_tokens_seen": 47717760, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.76953125, "step": 2209, "time_per_iteration": 2.446331024169922 }, { "auxiliary_loss_clip": 0.01111576, "auxiliary_loss_mlp": 0.01041452, "balance_loss_clip": 1.02059054, "balance_loss_mlp": 1.02965975, "epoch": 0.13287238839621224, "flos": 24497610952320.0, "grad_norm": 3.8266139058343094, "language_loss": 0.82185507, "learning_rate": 3.8285223669337995e-06, "loss": 0.84338534, "num_input_tokens_seen": 47737685, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.8203125, "step": 2210, "time_per_iteration": 2.407039165496826 }, { "auxiliary_loss_clip": 0.01028667, "auxiliary_loss_mlp": 0.01008429, "balance_loss_clip": 1.00596142, "balance_loss_mlp": 1.0048151, "epoch": 0.1329325116488802, "flos": 67691071054080.0, "grad_norm": 0.7631754503202972, "language_loss": 0.57968318, "learning_rate": 3.828369268932747e-06, "loss": 0.60005414, "num_input_tokens_seen": 47802415, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.23828125, "step": 2211, "time_per_iteration": 3.1298675537109375 }, { "auxiliary_loss_clip": 0.01027464, "auxiliary_loss_mlp": 0.01003969, "balance_loss_clip": 1.00142968, "balance_loss_mlp": 1.00369525, "epoch": 0.13299263490154817, "flos": 72258303715200.0, "grad_norm": 0.7972035907439725, "language_loss": 0.55318034, "learning_rate": 3.828216105681899e-06, "loss": 0.57349467, "num_input_tokens_seen": 47871485, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.23828125, "step": 2212, "time_per_iteration": 3.120358943939209 }, { "auxiliary_loss_clip": 0.01115866, "auxiliary_loss_mlp": 0.01043653, "balance_loss_clip": 1.02024066, "balance_loss_mlp": 1.03086209, "epoch": 0.13305275815421613, "flos": 17930308567680.0, "grad_norm": 3.686084098762373, "language_loss": 0.74999905, "learning_rate": 3.8280628771867205e-06, "loss": 0.77159429, "num_input_tokens_seen": 47888315, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8515625, "step": 2213, "time_per_iteration": 2.3642704486846924 }, { "auxiliary_loss_clip": 0.01106028, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.01936781, "balance_loss_mlp": 1.02920556, "epoch": 0.13311288140688413, "flos": 22339943884800.0, "grad_norm": 1.9918070964647272, "language_loss": 0.79267049, "learning_rate": 3.8279095834526815e-06, "loss": 0.81412941, "num_input_tokens_seen": 47906600, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.765625, "step": 2214, "time_per_iteration": 2.381934881210327 }, { "auxiliary_loss_clip": 0.01111644, "auxiliary_loss_mlp": 0.01048696, "balance_loss_clip": 1.02578413, "balance_loss_mlp": 1.03067029, "epoch": 0.1331730046595521, "flos": 31867531649280.0, "grad_norm": 1.8632679071922624, "language_loss": 0.69134682, "learning_rate": 3.8277562244852495e-06, "loss": 0.71295023, "num_input_tokens_seen": 47927630, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.8125, "step": 2215, "time_per_iteration": 2.469895601272583 }, { "auxiliary_loss_clip": 0.0110936, "auxiliary_loss_mlp": 0.01038844, "balance_loss_clip": 1.01729095, "balance_loss_mlp": 1.0285362, "epoch": 0.13323312791222006, "flos": 22565447556480.0, "grad_norm": 1.7429587827665565, "language_loss": 0.8103472, "learning_rate": 3.827602800289901e-06, "loss": 0.83182919, "num_input_tokens_seen": 47947935, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.8046875, "step": 2216, "time_per_iteration": 2.4177145957946777 }, { "auxiliary_loss_clip": 0.01110098, "auxiliary_loss_mlp": 0.01050275, "balance_loss_clip": 1.02679074, "balance_loss_mlp": 1.02934813, "epoch": 0.13329325116488802, "flos": 15084433313280.0, "grad_norm": 1.9884474111186918, "language_loss": 0.87180638, "learning_rate": 3.827449310872109e-06, "loss": 0.89341009, "num_input_tokens_seen": 47965515, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8046875, "step": 2217, "time_per_iteration": 2.3541815280914307 }, { "auxiliary_loss_clip": 0.01110352, "auxiliary_loss_mlp": 0.01042972, "balance_loss_clip": 1.02001226, "balance_loss_mlp": 1.03001714, "epoch": 0.133353374417556, "flos": 27452450160000.0, "grad_norm": 2.0869599672048142, "language_loss": 0.73178005, "learning_rate": 3.827295756237351e-06, "loss": 0.75331324, "num_input_tokens_seen": 47985675, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8046875, "step": 2218, "time_per_iteration": 2.4269580841064453 }, { "auxiliary_loss_clip": 0.01110731, "auxiliary_loss_mlp": 0.01040425, "balance_loss_clip": 1.0173105, "balance_loss_mlp": 1.03010273, "epoch": 0.13341349767022395, "flos": 24093631077120.0, "grad_norm": 1.9308602690623262, "language_loss": 0.87036943, "learning_rate": 3.8271421363911095e-06, "loss": 0.89188099, "num_input_tokens_seen": 48004985, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.8046875, "step": 2219, "time_per_iteration": 2.412137985229492 }, { "auxiliary_loss_clip": 0.01112182, "auxiliary_loss_mlp": 0.01044723, "balance_loss_clip": 1.02320623, "balance_loss_mlp": 1.03274846, "epoch": 0.13347362092289192, "flos": 24132209996160.0, "grad_norm": 1.8041370165092492, "language_loss": 0.77078104, "learning_rate": 3.826988451338864e-06, "loss": 0.79235017, "num_input_tokens_seen": 48024965, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.79296875, "step": 2220, "time_per_iteration": 2.3967785835266113 }, { "auxiliary_loss_clip": 0.01107377, "auxiliary_loss_mlp": 0.01038252, "balance_loss_clip": 1.01708031, "balance_loss_mlp": 1.02819431, "epoch": 0.1335337441755599, "flos": 18435711542400.0, "grad_norm": 7.832869738206291, "language_loss": 0.78862309, "learning_rate": 3.826834701086101e-06, "loss": 0.8100794, "num_input_tokens_seen": 48040890, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.7890625, "step": 2221, "time_per_iteration": 2.364459753036499 }, { "auxiliary_loss_clip": 0.01027571, "auxiliary_loss_mlp": 0.01008219, "balance_loss_clip": 1.00550091, "balance_loss_mlp": 1.00492895, "epoch": 0.13359386742822787, "flos": 50609395837440.0, "grad_norm": 1.0235370816867682, "language_loss": 0.69041914, "learning_rate": 3.826680885638306e-06, "loss": 0.71077704, "num_input_tokens_seen": 48091855, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.2265625, "step": 2222, "time_per_iteration": 2.852614641189575 }, { "auxiliary_loss_clip": 0.01111476, "auxiliary_loss_mlp": 0.01043529, "balance_loss_clip": 1.02135563, "balance_loss_mlp": 1.03144515, "epoch": 0.13365399068089584, "flos": 22777615088640.0, "grad_norm": 2.5645377689459323, "language_loss": 0.67273825, "learning_rate": 3.826527005000969e-06, "loss": 0.69428831, "num_input_tokens_seen": 48111350, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.80078125, "step": 2223, "time_per_iteration": 2.4104299545288086 }, { "auxiliary_loss_clip": 0.01111142, "auxiliary_loss_mlp": 0.01041346, "balance_loss_clip": 1.01845801, "balance_loss_mlp": 1.02990484, "epoch": 0.1337141139335638, "flos": 12530781072000.0, "grad_norm": 2.3977038475576817, "language_loss": 0.82913315, "learning_rate": 3.826373059179582e-06, "loss": 0.85065806, "num_input_tokens_seen": 48129840, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8125, "step": 2224, "time_per_iteration": 2.357208013534546 }, { "auxiliary_loss_clip": 0.01115536, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.01978946, "balance_loss_mlp": 1.03101516, "epoch": 0.13377423718623177, "flos": 23037857429760.0, "grad_norm": 2.4352710360103487, "language_loss": 0.6528067, "learning_rate": 3.826219048179639e-06, "loss": 0.67440045, "num_input_tokens_seen": 48149240, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.84375, "step": 2225, "time_per_iteration": 2.411724090576172 }, { "auxiliary_loss_clip": 0.01111973, "auxiliary_loss_mlp": 0.01051142, "balance_loss_clip": 1.02871895, "balance_loss_mlp": 1.03108084, "epoch": 0.13383436043889974, "flos": 16215479585280.0, "grad_norm": 2.2820423654325768, "language_loss": 0.89218378, "learning_rate": 3.826064972006635e-06, "loss": 0.9138149, "num_input_tokens_seen": 48166330, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.80859375, "step": 2226, "time_per_iteration": 2.3514645099639893 }, { "auxiliary_loss_clip": 0.01111681, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.02270985, "balance_loss_mlp": 1.03023171, "epoch": 0.1338944836915677, "flos": 24278530970880.0, "grad_norm": 2.3901396881263177, "language_loss": 0.74010229, "learning_rate": 3.825910830666069e-06, "loss": 0.76167989, "num_input_tokens_seen": 48187600, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.8125, "step": 2227, "time_per_iteration": 3.7839083671569824 }, { "auxiliary_loss_clip": 0.01109886, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.02022851, "balance_loss_mlp": 1.02917624, "epoch": 0.1339546069442357, "flos": 17597900712960.0, "grad_norm": 1.9744666336955208, "language_loss": 0.85193986, "learning_rate": 3.825756624163443e-06, "loss": 0.87346053, "num_input_tokens_seen": 48204400, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.80859375, "step": 2228, "time_per_iteration": 2.4063632488250732 }, { "auxiliary_loss_clip": 0.0111183, "auxiliary_loss_mlp": 0.01043096, "balance_loss_clip": 1.02082753, "balance_loss_mlp": 1.03083122, "epoch": 0.13401473019690366, "flos": 18989049680640.0, "grad_norm": 2.210839270971356, "language_loss": 0.80781674, "learning_rate": 3.825602352504259e-06, "loss": 0.82936597, "num_input_tokens_seen": 48222180, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.80859375, "step": 2229, "time_per_iteration": 3.8466081619262695 }, { "auxiliary_loss_clip": 0.01112713, "auxiliary_loss_mlp": 0.01054344, "balance_loss_clip": 1.03212357, "balance_loss_mlp": 1.03109515, "epoch": 0.13407485344957162, "flos": 26942578531200.0, "grad_norm": 1.7333625897589784, "language_loss": 0.73866439, "learning_rate": 3.825448015694023e-06, "loss": 0.76033497, "num_input_tokens_seen": 48243245, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.81640625, "step": 2230, "time_per_iteration": 3.813413619995117 }, { "auxiliary_loss_clip": 0.01114408, "auxiliary_loss_mlp": 0.01052288, "balance_loss_clip": 1.02911401, "balance_loss_mlp": 1.03100502, "epoch": 0.1341349767022396, "flos": 20338338061440.0, "grad_norm": 1.665509430892685, "language_loss": 0.80048466, "learning_rate": 3.8252936137382435e-06, "loss": 0.8221516, "num_input_tokens_seen": 48262600, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8359375, "step": 2231, "time_per_iteration": 3.74367356300354 }, { "auxiliary_loss_clip": 0.01113262, "auxiliary_loss_mlp": 0.01053751, "balance_loss_clip": 1.03018284, "balance_loss_mlp": 1.03137553, "epoch": 0.13419509995490755, "flos": 29860724033280.0, "grad_norm": 1.6743044726072736, "language_loss": 0.72241318, "learning_rate": 3.82513914664243e-06, "loss": 0.74408329, "num_input_tokens_seen": 48285075, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.81640625, "step": 2232, "time_per_iteration": 2.4403839111328125 }, { "auxiliary_loss_clip": 0.01116062, "auxiliary_loss_mlp": 0.01045066, "balance_loss_clip": 1.02165365, "balance_loss_mlp": 1.0313139, "epoch": 0.13425522320757552, "flos": 26941775569920.0, "grad_norm": 2.475825651129534, "language_loss": 0.65877473, "learning_rate": 3.824984614412095e-06, "loss": 0.68038601, "num_input_tokens_seen": 48301285, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.84765625, "step": 2233, "time_per_iteration": 2.420086145401001 }, { "auxiliary_loss_clip": 0.01110733, "auxiliary_loss_mlp": 0.01040684, "balance_loss_clip": 1.01764095, "balance_loss_mlp": 1.0282203, "epoch": 0.1343153464602435, "flos": 15776411927040.0, "grad_norm": 2.6796130628290333, "language_loss": 0.81137764, "learning_rate": 3.824830017052753e-06, "loss": 0.83289182, "num_input_tokens_seen": 48317835, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.82421875, "step": 2234, "time_per_iteration": 2.3355088233947754 }, { "auxiliary_loss_clip": 0.01114447, "auxiliary_loss_mlp": 0.01045909, "balance_loss_clip": 1.02279413, "balance_loss_mlp": 1.03209209, "epoch": 0.13437546971291148, "flos": 24313653665280.0, "grad_norm": 2.0611952609550626, "language_loss": 0.82459158, "learning_rate": 3.824675354569923e-06, "loss": 0.8461951, "num_input_tokens_seen": 48335670, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.82421875, "step": 2235, "time_per_iteration": 2.398684501647949 }, { "auxiliary_loss_clip": 0.01113562, "auxiliary_loss_mlp": 0.01046007, "balance_loss_clip": 1.02316618, "balance_loss_mlp": 1.03003097, "epoch": 0.13443559296557944, "flos": 26649482734080.0, "grad_norm": 1.8639521609867127, "language_loss": 0.86475575, "learning_rate": 3.824520626969122e-06, "loss": 0.88635147, "num_input_tokens_seen": 48357805, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8359375, "step": 2236, "time_per_iteration": 2.4377503395080566 }, { "auxiliary_loss_clip": 0.01115367, "auxiliary_loss_mlp": 0.01041982, "balance_loss_clip": 1.01909375, "balance_loss_mlp": 1.03230786, "epoch": 0.1344957162182474, "flos": 21795193941120.0, "grad_norm": 1.6210711622806835, "language_loss": 0.77399528, "learning_rate": 3.824365834255874e-06, "loss": 0.79556882, "num_input_tokens_seen": 48377845, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.83203125, "step": 2237, "time_per_iteration": 2.3981072902679443 }, { "auxiliary_loss_clip": 0.01115959, "auxiliary_loss_mlp": 0.01050458, "balance_loss_clip": 1.02498269, "balance_loss_mlp": 1.0319252, "epoch": 0.13455583947091537, "flos": 19864531733760.0, "grad_norm": 3.126998467904437, "language_loss": 0.78480875, "learning_rate": 3.824210976435702e-06, "loss": 0.8064729, "num_input_tokens_seen": 48394735, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 0.83984375, "step": 2238, "time_per_iteration": 2.369377851486206 }, { "auxiliary_loss_clip": 0.01108949, "auxiliary_loss_mlp": 0.01037434, "balance_loss_clip": 1.01383018, "balance_loss_mlp": 1.02933514, "epoch": 0.13461596272358334, "flos": 30845519153280.0, "grad_norm": 2.5142370982128113, "language_loss": 0.68518054, "learning_rate": 3.824056053514132e-06, "loss": 0.70664436, "num_input_tokens_seen": 48414200, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.796875, "step": 2239, "time_per_iteration": 2.4480724334716797 }, { "auxiliary_loss_clip": 0.01115627, "auxiliary_loss_mlp": 0.01046975, "balance_loss_clip": 1.02345467, "balance_loss_mlp": 1.0310353, "epoch": 0.1346760859762513, "flos": 12493633518720.0, "grad_norm": 2.4649535556244233, "language_loss": 0.8140105, "learning_rate": 3.823901065496693e-06, "loss": 0.8356365, "num_input_tokens_seen": 48431065, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.84765625, "step": 2240, "time_per_iteration": 2.3443541526794434 }, { "auxiliary_loss_clip": 0.0111462, "auxiliary_loss_mlp": 0.01047804, "balance_loss_clip": 1.02402151, "balance_loss_mlp": 1.03058279, "epoch": 0.1347362092289193, "flos": 21834924935040.0, "grad_norm": 1.6357289943000772, "language_loss": 0.77624297, "learning_rate": 3.823746012388918e-06, "loss": 0.79786718, "num_input_tokens_seen": 48450335, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.83984375, "step": 2241, "time_per_iteration": 2.386347532272339 }, { "auxiliary_loss_clip": 0.01109833, "auxiliary_loss_mlp": 0.0104163, "balance_loss_clip": 1.0201124, "balance_loss_mlp": 1.03068209, "epoch": 0.13479633248158726, "flos": 23508451912320.0, "grad_norm": 1.659205909924718, "language_loss": 0.82998061, "learning_rate": 3.823590894196339e-06, "loss": 0.85149527, "num_input_tokens_seen": 48468555, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7890625, "step": 2242, "time_per_iteration": 2.4124395847320557 }, { "auxiliary_loss_clip": 0.01114746, "auxiliary_loss_mlp": 0.0104936, "balance_loss_clip": 1.02486253, "balance_loss_mlp": 1.03196406, "epoch": 0.13485645573425523, "flos": 29343241728000.0, "grad_norm": 3.4559024996267147, "language_loss": 0.64423156, "learning_rate": 3.823435710924491e-06, "loss": 0.66587257, "num_input_tokens_seen": 48488515, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.828125, "step": 2243, "time_per_iteration": 2.446357250213623 }, { "auxiliary_loss_clip": 0.01107438, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.01361525, "balance_loss_mlp": 1.02859759, "epoch": 0.1349165789869232, "flos": 28035883756800.0, "grad_norm": 1.9068844032861496, "language_loss": 0.72579181, "learning_rate": 3.823280462578913e-06, "loss": 0.7472204, "num_input_tokens_seen": 48510515, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.7890625, "step": 2244, "time_per_iteration": 2.431729316711426 }, { "auxiliary_loss_clip": 0.01111567, "auxiliary_loss_mlp": 0.01045916, "balance_loss_clip": 1.02393365, "balance_loss_mlp": 1.03124416, "epoch": 0.13497670223959116, "flos": 22852713156480.0, "grad_norm": 1.5985867586427198, "language_loss": 0.85773522, "learning_rate": 3.8231251491651455e-06, "loss": 0.87931001, "num_input_tokens_seen": 48529940, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.8046875, "step": 2245, "time_per_iteration": 2.402890920639038 }, { "auxiliary_loss_clip": 0.0110876, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.01528263, "balance_loss_mlp": 1.03117847, "epoch": 0.13503682549225912, "flos": 16503757614720.0, "grad_norm": 3.0225897100770207, "language_loss": 0.7903704, "learning_rate": 3.822969770688732e-06, "loss": 0.81183338, "num_input_tokens_seen": 48548190, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.7734375, "step": 2246, "time_per_iteration": 2.375706911087036 }, { "auxiliary_loss_clip": 0.01026554, "auxiliary_loss_mlp": 0.01003031, "balance_loss_clip": 1.00014651, "balance_loss_mlp": 1.00370598, "epoch": 0.1350969487449271, "flos": 70753023912960.0, "grad_norm": 0.7460429799412394, "language_loss": 0.6049459, "learning_rate": 3.8228143271552154e-06, "loss": 0.62524176, "num_input_tokens_seen": 48613165, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.22851562, "step": 2247, "time_per_iteration": 3.1216773986816406 }, { "auxiliary_loss_clip": 0.01116568, "auxiliary_loss_mlp": 0.01046591, "balance_loss_clip": 1.02258193, "balance_loss_mlp": 1.03152978, "epoch": 0.13515707199759508, "flos": 23074865337600.0, "grad_norm": 1.9198332175671928, "language_loss": 0.81013012, "learning_rate": 3.822658818570145e-06, "loss": 0.83176172, "num_input_tokens_seen": 48631705, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8515625, "step": 2248, "time_per_iteration": 2.382345199584961 }, { "auxiliary_loss_clip": 0.01108966, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.0176791, "balance_loss_mlp": 1.03090119, "epoch": 0.13521719525026304, "flos": 23185225745280.0, "grad_norm": 1.7824571080205176, "language_loss": 0.76665759, "learning_rate": 3.822503244939069e-06, "loss": 0.78813815, "num_input_tokens_seen": 48649740, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.78125, "step": 2249, "time_per_iteration": 2.39690899848938 }, { "auxiliary_loss_clip": 0.01112266, "auxiliary_loss_mlp": 0.01047254, "balance_loss_clip": 1.02599943, "balance_loss_mlp": 1.03164351, "epoch": 0.135277318502931, "flos": 24789764142720.0, "grad_norm": 1.4487850858130753, "language_loss": 0.84145266, "learning_rate": 3.822347606267541e-06, "loss": 0.86304784, "num_input_tokens_seen": 48671565, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.8046875, "step": 2250, "time_per_iteration": 2.437000274658203 }, { "auxiliary_loss_clip": 0.01112671, "auxiliary_loss_mlp": 0.01045562, "balance_loss_clip": 1.02204239, "balance_loss_mlp": 1.03052807, "epoch": 0.13533744175559898, "flos": 21907439562240.0, "grad_norm": 2.9768659822997896, "language_loss": 0.82101446, "learning_rate": 3.8221919025611145e-06, "loss": 0.84259683, "num_input_tokens_seen": 48690425, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.8203125, "step": 2251, "time_per_iteration": 2.400810480117798 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01045632, "balance_loss_clip": 1.02289891, "balance_loss_mlp": 1.02972174, "epoch": 0.13539756500826694, "flos": 21210678092160.0, "grad_norm": 1.6350101086519406, "language_loss": 0.85983527, "learning_rate": 3.822036133825346e-06, "loss": 0.88140088, "num_input_tokens_seen": 48707505, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.8125, "step": 2252, "time_per_iteration": 2.3785557746887207 }, { "auxiliary_loss_clip": 0.01026414, "auxiliary_loss_mlp": 0.01003823, "balance_loss_clip": 1.00139153, "balance_loss_mlp": 1.00329792, "epoch": 0.1354576882609349, "flos": 63238981656960.0, "grad_norm": 0.7738025226790045, "language_loss": 0.61805081, "learning_rate": 3.821880300065794e-06, "loss": 0.63835323, "num_input_tokens_seen": 48775895, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.23144531, "step": 2253, "time_per_iteration": 3.105794906616211 }, { "auxiliary_loss_clip": 0.01112569, "auxiliary_loss_mlp": 0.01045554, "balance_loss_clip": 1.02329731, "balance_loss_mlp": 1.03244042, "epoch": 0.1355178115136029, "flos": 25481882401920.0, "grad_norm": 1.8286536160423945, "language_loss": 0.89134341, "learning_rate": 3.821724401288022e-06, "loss": 0.91292465, "num_input_tokens_seen": 48798370, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8046875, "step": 2254, "time_per_iteration": 2.444282054901123 }, { "auxiliary_loss_clip": 0.01115733, "auxiliary_loss_mlp": 0.01052232, "balance_loss_clip": 1.02819943, "balance_loss_mlp": 1.02996039, "epoch": 0.13557793476627086, "flos": 21615879864960.0, "grad_norm": 1.874734162542784, "language_loss": 0.84478366, "learning_rate": 3.821568437497592e-06, "loss": 0.86646336, "num_input_tokens_seen": 48817955, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.859375, "step": 2255, "time_per_iteration": 2.3827905654907227 }, { "auxiliary_loss_clip": 0.01110767, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.02015567, "balance_loss_mlp": 1.02874386, "epoch": 0.13563805801893883, "flos": 24927322366080.0, "grad_norm": 2.5800977448637177, "language_loss": 0.74805433, "learning_rate": 3.821412408700069e-06, "loss": 0.76959157, "num_input_tokens_seen": 48836330, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8203125, "step": 2256, "time_per_iteration": 2.392350912094116 }, { "auxiliary_loss_clip": 0.01113995, "auxiliary_loss_mlp": 0.01049828, "balance_loss_clip": 1.02698743, "balance_loss_mlp": 1.03051257, "epoch": 0.1356981812716068, "flos": 14749581663360.0, "grad_norm": 2.636399814040291, "language_loss": 0.83367229, "learning_rate": 3.821256314901023e-06, "loss": 0.8553105, "num_input_tokens_seen": 48851890, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8359375, "step": 2257, "time_per_iteration": 2.3453638553619385 }, { "auxiliary_loss_clip": 0.01117427, "auxiliary_loss_mlp": 0.01045476, "balance_loss_clip": 1.02176523, "balance_loss_mlp": 1.03027248, "epoch": 0.13575830452427476, "flos": 11107791077760.0, "grad_norm": 2.4127163814424946, "language_loss": 0.81851101, "learning_rate": 3.821100156106024e-06, "loss": 0.84013999, "num_input_tokens_seen": 48865510, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.875, "step": 2258, "time_per_iteration": 2.3260347843170166 }, { "auxiliary_loss_clip": 0.01112168, "auxiliary_loss_mlp": 0.01047014, "balance_loss_clip": 1.02205133, "balance_loss_mlp": 1.03000212, "epoch": 0.13581842777694272, "flos": 17959531242240.0, "grad_norm": 2.4992820719003985, "language_loss": 0.82302582, "learning_rate": 3.820943932320644e-06, "loss": 0.84461761, "num_input_tokens_seen": 48882360, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 0.8203125, "step": 2259, "time_per_iteration": 2.3578600883483887 }, { "auxiliary_loss_clip": 0.01115546, "auxiliary_loss_mlp": 0.01044489, "balance_loss_clip": 1.02334094, "balance_loss_mlp": 1.03380466, "epoch": 0.1358785510296107, "flos": 22856029735680.0, "grad_norm": 1.8211233775954654, "language_loss": 0.73700893, "learning_rate": 3.82078764355046e-06, "loss": 0.75860929, "num_input_tokens_seen": 48902700, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.81640625, "step": 2260, "time_per_iteration": 2.391852378845215 }, { "auxiliary_loss_clip": 0.01109938, "auxiliary_loss_mlp": 0.01051966, "balance_loss_clip": 1.03000736, "balance_loss_mlp": 1.02989888, "epoch": 0.13593867428227868, "flos": 25738214670720.0, "grad_norm": 2.3528590007555854, "language_loss": 0.75280863, "learning_rate": 3.820631289801048e-06, "loss": 0.77442765, "num_input_tokens_seen": 48922525, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.80078125, "step": 2261, "time_per_iteration": 2.457080602645874 }, { "auxiliary_loss_clip": 0.0111298, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.01767623, "balance_loss_mlp": 1.03044224, "epoch": 0.13599879753494665, "flos": 31247858194560.0, "grad_norm": 3.7731977894452378, "language_loss": 0.63001621, "learning_rate": 3.82047487107799e-06, "loss": 0.65154088, "num_input_tokens_seen": 48942510, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.82421875, "step": 2262, "time_per_iteration": 2.46036696434021 }, { "auxiliary_loss_clip": 0.01111991, "auxiliary_loss_mlp": 0.01041925, "balance_loss_clip": 1.01963282, "balance_loss_mlp": 1.03004408, "epoch": 0.1360589207876146, "flos": 23913898064640.0, "grad_norm": 2.647698818670158, "language_loss": 0.82917178, "learning_rate": 3.820318387386865e-06, "loss": 0.85071099, "num_input_tokens_seen": 48962625, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8203125, "step": 2263, "time_per_iteration": 2.4298808574676514 }, { "auxiliary_loss_clip": 0.01116094, "auxiliary_loss_mlp": 0.01052066, "balance_loss_clip": 1.0279026, "balance_loss_mlp": 1.03200746, "epoch": 0.13611904404028258, "flos": 19973181484800.0, "grad_norm": 2.037974726999726, "language_loss": 0.87724793, "learning_rate": 3.8201618387332605e-06, "loss": 0.8989296, "num_input_tokens_seen": 48982525, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.84375, "step": 2264, "time_per_iteration": 2.3881897926330566 }, { "auxiliary_loss_clip": 0.01116603, "auxiliary_loss_mlp": 0.01043247, "balance_loss_clip": 1.01901174, "balance_loss_mlp": 1.03215957, "epoch": 0.13617916729295054, "flos": 15339753152640.0, "grad_norm": 3.0086405021950764, "language_loss": 0.71634519, "learning_rate": 3.82000522512276e-06, "loss": 0.73794365, "num_input_tokens_seen": 48997605, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.84375, "step": 2265, "time_per_iteration": 2.362116813659668 }, { "auxiliary_loss_clip": 0.01110729, "auxiliary_loss_mlp": 0.01036144, "balance_loss_clip": 1.01580715, "balance_loss_mlp": 1.03179741, "epoch": 0.1362392905456185, "flos": 27450285655680.0, "grad_norm": 2.2099575569314935, "language_loss": 0.66132319, "learning_rate": 3.819848546560957e-06, "loss": 0.68279195, "num_input_tokens_seen": 49018535, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7890625, "step": 2266, "time_per_iteration": 2.4455068111419678 }, { "auxiliary_loss_clip": 0.01111668, "auxiliary_loss_mlp": 0.01050643, "balance_loss_clip": 1.02792215, "balance_loss_mlp": 1.03137374, "epoch": 0.1362994137982865, "flos": 25007866606080.0, "grad_norm": 1.585383208781827, "language_loss": 0.76206291, "learning_rate": 3.819691803053439e-06, "loss": 0.78368604, "num_input_tokens_seen": 49038865, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.8046875, "step": 2267, "time_per_iteration": 3.8275153636932373 }, { "auxiliary_loss_clip": 0.01110651, "auxiliary_loss_mlp": 0.01041567, "balance_loss_clip": 1.01919103, "balance_loss_mlp": 1.02962959, "epoch": 0.13635953705095447, "flos": 20301993469440.0, "grad_norm": 2.200944711480025, "language_loss": 0.81724751, "learning_rate": 3.819534994605802e-06, "loss": 0.83876967, "num_input_tokens_seen": 49058010, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.80859375, "step": 2268, "time_per_iteration": 2.3766119480133057 }, { "auxiliary_loss_clip": 0.01109573, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.01694036, "balance_loss_mlp": 1.03026772, "epoch": 0.13641966030362243, "flos": 31357066527360.0, "grad_norm": 1.7898561510552156, "language_loss": 0.75749362, "learning_rate": 3.819378121223641e-06, "loss": 0.77897686, "num_input_tokens_seen": 49080330, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.79296875, "step": 2269, "time_per_iteration": 3.867666244506836 }, { "auxiliary_loss_clip": 0.01115323, "auxiliary_loss_mlp": 0.01037113, "balance_loss_clip": 1.01519012, "balance_loss_mlp": 1.03264654, "epoch": 0.1364797835562904, "flos": 20477257827840.0, "grad_norm": 2.098228893476109, "language_loss": 0.80965889, "learning_rate": 3.819221182912555e-06, "loss": 0.8311832, "num_input_tokens_seen": 49097035, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.828125, "step": 2270, "time_per_iteration": 5.221672773361206 }, { "auxiliary_loss_clip": 0.01115496, "auxiliary_loss_mlp": 0.01048832, "balance_loss_clip": 1.02586031, "balance_loss_mlp": 1.03060257, "epoch": 0.13653990680895836, "flos": 13077520963200.0, "grad_norm": 2.722045196078793, "language_loss": 0.75869644, "learning_rate": 3.819064179678145e-06, "loss": 0.78033966, "num_input_tokens_seen": 49113945, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.84765625, "step": 2271, "time_per_iteration": 2.361807107925415 }, { "auxiliary_loss_clip": 0.01114825, "auxiliary_loss_mlp": 0.01044332, "balance_loss_clip": 1.02134812, "balance_loss_mlp": 1.03116345, "epoch": 0.13660003006162633, "flos": 16945757827200.0, "grad_norm": 1.8230049855742485, "language_loss": 0.80149591, "learning_rate": 3.8189071115260134e-06, "loss": 0.82308745, "num_input_tokens_seen": 49132855, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8359375, "step": 2272, "time_per_iteration": 2.3897011280059814 }, { "auxiliary_loss_clip": 0.01027994, "auxiliary_loss_mlp": 0.01020793, "balance_loss_clip": 1.01800334, "balance_loss_mlp": 1.00485504, "epoch": 0.1366601533142943, "flos": 68679357310080.0, "grad_norm": 0.6956821962683516, "language_loss": 0.60680348, "learning_rate": 3.818749978461765e-06, "loss": 0.62729138, "num_input_tokens_seen": 49198310, "router_z_loss_clip": 0.0279541, "router_z_loss_mlp": 0.23144531, "step": 2273, "time_per_iteration": 3.1359124183654785 }, { "auxiliary_loss_clip": 0.01109442, "auxiliary_loss_mlp": 0.01042235, "balance_loss_clip": 1.02008581, "balance_loss_mlp": 1.02961373, "epoch": 0.13672027656696228, "flos": 19243252356480.0, "grad_norm": 1.6551537076379452, "language_loss": 0.77221978, "learning_rate": 3.8185927804910096e-06, "loss": 0.79373658, "num_input_tokens_seen": 49217250, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.796875, "step": 2274, "time_per_iteration": 2.3578431606292725 }, { "auxiliary_loss_clip": 0.01112591, "auxiliary_loss_mlp": 0.01045059, "balance_loss_clip": 1.02202773, "balance_loss_mlp": 1.03039908, "epoch": 0.13678039981963025, "flos": 24533780987520.0, "grad_norm": 2.473963726005356, "language_loss": 0.7832284, "learning_rate": 3.818435517619355e-06, "loss": 0.80480492, "num_input_tokens_seen": 49236615, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8203125, "step": 2275, "time_per_iteration": 2.43884539604187 }, { "auxiliary_loss_clip": 0.01110935, "auxiliary_loss_mlp": 0.01041719, "balance_loss_clip": 1.02028525, "balance_loss_mlp": 1.0302285, "epoch": 0.13684052307229821, "flos": 15668425491840.0, "grad_norm": 2.7714124972923755, "language_loss": 0.81413603, "learning_rate": 3.818278189852415e-06, "loss": 0.83566254, "num_input_tokens_seen": 49253935, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.80859375, "step": 2276, "time_per_iteration": 2.362184524536133 }, { "auxiliary_loss_clip": 0.01119973, "auxiliary_loss_mlp": 0.01049788, "balance_loss_clip": 1.02260852, "balance_loss_mlp": 1.03208447, "epoch": 0.13690064632496618, "flos": 28363473843840.0, "grad_norm": 2.480011971937364, "language_loss": 0.69309795, "learning_rate": 3.8181207971958025e-06, "loss": 0.71479559, "num_input_tokens_seen": 49273605, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 0.87890625, "step": 2277, "time_per_iteration": 2.4605045318603516 }, { "auxiliary_loss_clip": 0.01112767, "auxiliary_loss_mlp": 0.01054563, "balance_loss_clip": 1.03097129, "balance_loss_mlp": 1.03087139, "epoch": 0.13696076957763414, "flos": 23403642410880.0, "grad_norm": 2.1244063974828564, "language_loss": 0.80648291, "learning_rate": 3.817963339655137e-06, "loss": 0.82815623, "num_input_tokens_seen": 49291785, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.8203125, "step": 2278, "time_per_iteration": 2.3916478157043457 }, { "auxiliary_loss_clip": 0.01112518, "auxiliary_loss_mlp": 0.01041664, "balance_loss_clip": 1.01930034, "balance_loss_mlp": 1.03335357, "epoch": 0.1370208928303021, "flos": 37195068188160.0, "grad_norm": 2.334388181097841, "language_loss": 0.7501992, "learning_rate": 3.8178058172360346e-06, "loss": 0.77174109, "num_input_tokens_seen": 49311405, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.7890625, "step": 2279, "time_per_iteration": 2.517383098602295 }, { "auxiliary_loss_clip": 0.0111511, "auxiliary_loss_mlp": 0.01046556, "balance_loss_clip": 1.02408552, "balance_loss_mlp": 1.03076339, "epoch": 0.13708101608297008, "flos": 26975187607680.0, "grad_norm": 1.8940635031675936, "language_loss": 0.76659471, "learning_rate": 3.817648229944119e-06, "loss": 0.78821135, "num_input_tokens_seen": 49331835, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.84375, "step": 2280, "time_per_iteration": 2.4143829345703125 }, { "auxiliary_loss_clip": 0.01107492, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.01880753, "balance_loss_mlp": 1.02764416, "epoch": 0.13714113933563807, "flos": 32555635102080.0, "grad_norm": 1.73368062530258, "language_loss": 0.79739249, "learning_rate": 3.817490577785014e-06, "loss": 0.81888437, "num_input_tokens_seen": 49352290, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.796875, "step": 2281, "time_per_iteration": 2.465832233428955 }, { "auxiliary_loss_clip": 0.01117367, "auxiliary_loss_mlp": 0.01047146, "balance_loss_clip": 1.0238409, "balance_loss_mlp": 1.03159499, "epoch": 0.13720126258830603, "flos": 16100510878080.0, "grad_norm": 1.7304683509529122, "language_loss": 0.83738309, "learning_rate": 3.817332860764346e-06, "loss": 0.85902822, "num_input_tokens_seen": 49370285, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.859375, "step": 2282, "time_per_iteration": 2.3765079975128174 }, { "auxiliary_loss_clip": 0.01109256, "auxiliary_loss_mlp": 0.01042945, "balance_loss_clip": 1.02089167, "balance_loss_mlp": 1.02886319, "epoch": 0.137261385840974, "flos": 18952530531840.0, "grad_norm": 1.6378516218141752, "language_loss": 0.73454171, "learning_rate": 3.817175078887742e-06, "loss": 0.75606376, "num_input_tokens_seen": 49389610, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.8046875, "step": 2283, "time_per_iteration": 2.3719022274017334 }, { "auxiliary_loss_clip": 0.01113893, "auxiliary_loss_mlp": 0.01046806, "balance_loss_clip": 1.02537262, "balance_loss_mlp": 1.03361201, "epoch": 0.13732150909364196, "flos": 23294224609920.0, "grad_norm": 2.343842559962333, "language_loss": 0.83827215, "learning_rate": 3.8170172321608345e-06, "loss": 0.85987914, "num_input_tokens_seen": 49408390, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.8046875, "step": 2284, "time_per_iteration": 2.398379325866699 }, { "auxiliary_loss_clip": 0.01116226, "auxiliary_loss_mlp": 0.01046771, "balance_loss_clip": 1.02210665, "balance_loss_mlp": 1.03020883, "epoch": 0.13738163234630993, "flos": 29349979620480.0, "grad_norm": 1.769826224992319, "language_loss": 0.74995393, "learning_rate": 3.816859320589255e-06, "loss": 0.77158391, "num_input_tokens_seen": 49427725, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.859375, "step": 2285, "time_per_iteration": 2.5012168884277344 }, { "auxiliary_loss_clip": 0.0111146, "auxiliary_loss_mlp": 0.01044058, "balance_loss_clip": 1.02106261, "balance_loss_mlp": 1.03127074, "epoch": 0.1374417555989779, "flos": 26650111138560.0, "grad_norm": 1.7714291008538752, "language_loss": 0.74398136, "learning_rate": 3.81670134417864e-06, "loss": 0.76553655, "num_input_tokens_seen": 49449000, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.80078125, "step": 2286, "time_per_iteration": 2.4326276779174805 }, { "auxiliary_loss_clip": 0.01117541, "auxiliary_loss_mlp": 0.01050704, "balance_loss_clip": 1.02468061, "balance_loss_mlp": 1.03174639, "epoch": 0.1375018788516459, "flos": 28402122585600.0, "grad_norm": 2.0022977447187134, "language_loss": 0.86365223, "learning_rate": 3.8165433029346276e-06, "loss": 0.88533461, "num_input_tokens_seen": 49468360, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 0.859375, "step": 2287, "time_per_iteration": 2.4316582679748535 }, { "auxiliary_loss_clip": 0.0111424, "auxiliary_loss_mlp": 0.01047324, "balance_loss_clip": 1.02416182, "balance_loss_mlp": 1.03153014, "epoch": 0.13756200210431385, "flos": 37412297867520.0, "grad_norm": 1.8374540548779694, "language_loss": 0.68856287, "learning_rate": 3.816385196862858e-06, "loss": 0.71017849, "num_input_tokens_seen": 49493450, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.828125, "step": 2288, "time_per_iteration": 2.5205066204071045 }, { "auxiliary_loss_clip": 0.01114996, "auxiliary_loss_mlp": 0.01044784, "balance_loss_clip": 1.02221727, "balance_loss_mlp": 1.03295159, "epoch": 0.13762212535698182, "flos": 22709918229120.0, "grad_norm": 2.3650727449351887, "language_loss": 0.86925477, "learning_rate": 3.816227025968972e-06, "loss": 0.89085257, "num_input_tokens_seen": 49511220, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.8203125, "step": 2289, "time_per_iteration": 2.3779609203338623 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01043833, "balance_loss_clip": 1.02186263, "balance_loss_mlp": 1.02881837, "epoch": 0.13768224860964978, "flos": 23950975795200.0, "grad_norm": 1.8917479365362528, "language_loss": 0.74836767, "learning_rate": 3.8160687902586155e-06, "loss": 0.76989353, "num_input_tokens_seen": 49529820, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.796875, "step": 2290, "time_per_iteration": 2.411607027053833 }, { "auxiliary_loss_clip": 0.01026061, "auxiliary_loss_mlp": 0.01013156, "balance_loss_clip": 1.01010406, "balance_loss_mlp": 1.00359797, "epoch": 0.13774237186231775, "flos": 63586750510080.0, "grad_norm": 0.7022014569495892, "language_loss": 0.51588422, "learning_rate": 3.815910489737436e-06, "loss": 0.5362764, "num_input_tokens_seen": 49595325, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.22460938, "step": 2291, "time_per_iteration": 3.0735068321228027 }, { "auxiliary_loss_clip": 0.01111884, "auxiliary_loss_mlp": 0.01043238, "balance_loss_clip": 1.01936054, "balance_loss_mlp": 1.03058958, "epoch": 0.1378024951149857, "flos": 24278321502720.0, "grad_norm": 1.7932632724464097, "language_loss": 0.70804548, "learning_rate": 3.815752124411081e-06, "loss": 0.72959673, "num_input_tokens_seen": 49615850, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.81640625, "step": 2292, "time_per_iteration": 2.408328056335449 }, { "auxiliary_loss_clip": 0.01112536, "auxiliary_loss_mlp": 0.01049368, "balance_loss_clip": 1.02622998, "balance_loss_mlp": 1.03175652, "epoch": 0.13786261836765368, "flos": 14020839521280.0, "grad_norm": 2.592992239259999, "language_loss": 0.80301976, "learning_rate": 3.815593694285204e-06, "loss": 0.82463878, "num_input_tokens_seen": 49631860, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.80859375, "step": 2293, "time_per_iteration": 2.3571887016296387 }, { "auxiliary_loss_clip": 0.01113389, "auxiliary_loss_mlp": 0.01049373, "balance_loss_clip": 1.0256741, "balance_loss_mlp": 1.03126013, "epoch": 0.13792274162032167, "flos": 28877360279040.0, "grad_norm": 2.1590697829436465, "language_loss": 0.78471428, "learning_rate": 3.815435199365459e-06, "loss": 0.80634189, "num_input_tokens_seen": 49652145, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.8203125, "step": 2294, "time_per_iteration": 2.459261894226074 }, { "auxiliary_loss_clip": 0.01113719, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.02364314, "balance_loss_mlp": 1.03316617, "epoch": 0.13798286487298964, "flos": 21140118501120.0, "grad_norm": 2.209995404447119, "language_loss": 0.80169517, "learning_rate": 3.815276639657501e-06, "loss": 0.82327884, "num_input_tokens_seen": 49669880, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.8046875, "step": 2295, "time_per_iteration": 2.372267484664917 }, { "auxiliary_loss_clip": 0.01111241, "auxiliary_loss_mlp": 0.0104571, "balance_loss_clip": 1.02134347, "balance_loss_mlp": 1.02965164, "epoch": 0.1380429881256576, "flos": 22486509239040.0, "grad_norm": 1.8298599221433658, "language_loss": 0.78164601, "learning_rate": 3.815118015166989e-06, "loss": 0.80321556, "num_input_tokens_seen": 49687255, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.81640625, "step": 2296, "time_per_iteration": 2.387343406677246 }, { "auxiliary_loss_clip": 0.01116511, "auxiliary_loss_mlp": 0.01047356, "balance_loss_clip": 1.0243485, "balance_loss_mlp": 1.0339613, "epoch": 0.13810311137832557, "flos": 21392715254400.0, "grad_norm": 1.833720412786261, "language_loss": 0.78415352, "learning_rate": 3.814959325899584e-06, "loss": 0.80579221, "num_input_tokens_seen": 49706650, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.828125, "step": 2297, "time_per_iteration": 2.3892436027526855 }, { "auxiliary_loss_clip": 0.01111782, "auxiliary_loss_mlp": 0.01047091, "balance_loss_clip": 1.02507257, "balance_loss_mlp": 1.03123212, "epoch": 0.13816323463099353, "flos": 25988786565120.0, "grad_norm": 2.3329139411238775, "language_loss": 0.68648392, "learning_rate": 3.81480057186095e-06, "loss": 0.70807266, "num_input_tokens_seen": 49725715, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.8046875, "step": 2298, "time_per_iteration": 2.412243604660034 }, { "auxiliary_loss_clip": 0.01116407, "auxiliary_loss_mlp": 0.01051851, "balance_loss_clip": 1.02841473, "balance_loss_mlp": 1.0311178, "epoch": 0.1382233578836615, "flos": 19243322179200.0, "grad_norm": 2.047238548430911, "language_loss": 0.86757356, "learning_rate": 3.814641753056751e-06, "loss": 0.88925612, "num_input_tokens_seen": 49744710, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.8515625, "step": 2299, "time_per_iteration": 2.362157106399536 }, { "auxiliary_loss_clip": 0.01111331, "auxiliary_loss_mlp": 0.01052014, "balance_loss_clip": 1.02799284, "balance_loss_mlp": 1.02935147, "epoch": 0.1382834811363295, "flos": 25665106550400.0, "grad_norm": 1.782685339291963, "language_loss": 0.75776196, "learning_rate": 3.8144828694926565e-06, "loss": 0.77939546, "num_input_tokens_seen": 49764300, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.8203125, "step": 2300, "time_per_iteration": 2.427091121673584 }, { "auxiliary_loss_clip": 0.0111164, "auxiliary_loss_mlp": 0.01046812, "balance_loss_clip": 1.02509212, "balance_loss_mlp": 1.03253913, "epoch": 0.13834360438899745, "flos": 19783394000640.0, "grad_norm": 2.815439832138442, "language_loss": 0.8307541, "learning_rate": 3.814323921174335e-06, "loss": 0.85233855, "num_input_tokens_seen": 49778380, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.7890625, "step": 2301, "time_per_iteration": 2.347304582595825 }, { "auxiliary_loss_clip": 0.01109228, "auxiliary_loss_mlp": 0.0104721, "balance_loss_clip": 1.0243578, "balance_loss_mlp": 1.03007984, "epoch": 0.13840372764166542, "flos": 26650634808960.0, "grad_norm": 1.8641862102081654, "language_loss": 0.85776269, "learning_rate": 3.81416490810746e-06, "loss": 0.87932712, "num_input_tokens_seen": 49797460, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.7890625, "step": 2302, "time_per_iteration": 2.4252429008483887 }, { "auxiliary_loss_clip": 0.01025471, "auxiliary_loss_mlp": 0.01006036, "balance_loss_clip": 1.00288916, "balance_loss_mlp": 1.00358677, "epoch": 0.13846385089433338, "flos": 70507444343040.0, "grad_norm": 0.7557448061426103, "language_loss": 0.65586698, "learning_rate": 3.814005830297706e-06, "loss": 0.67618203, "num_input_tokens_seen": 49868005, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.21875, "step": 2303, "time_per_iteration": 3.1661269664764404 }, { "auxiliary_loss_clip": 0.01109203, "auxiliary_loss_mlp": 0.01046523, "balance_loss_clip": 1.02403975, "balance_loss_mlp": 1.03107214, "epoch": 0.13852397414700135, "flos": 17347747754880.0, "grad_norm": 1.7502725857789592, "language_loss": 0.78403562, "learning_rate": 3.81384668775075e-06, "loss": 0.80559289, "num_input_tokens_seen": 49885825, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.78125, "step": 2304, "time_per_iteration": 2.371805191040039 }, { "auxiliary_loss_clip": 0.0111554, "auxiliary_loss_mlp": 0.01040256, "balance_loss_clip": 1.01833344, "balance_loss_mlp": 1.03259957, "epoch": 0.13858409739966931, "flos": 21542701921920.0, "grad_norm": 2.023868147563291, "language_loss": 0.77400017, "learning_rate": 3.8136874804722724e-06, "loss": 0.79555821, "num_input_tokens_seen": 49905975, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.83203125, "step": 2305, "time_per_iteration": 2.439689874649048 }, { "auxiliary_loss_clip": 0.01109113, "auxiliary_loss_mlp": 0.01044396, "balance_loss_clip": 1.02222347, "balance_loss_mlp": 1.02947581, "epoch": 0.13864422065233728, "flos": 21578837045760.0, "grad_norm": 1.7596167687772786, "language_loss": 0.87383056, "learning_rate": 3.813528208467953e-06, "loss": 0.89536566, "num_input_tokens_seen": 49925800, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.796875, "step": 2306, "time_per_iteration": 2.4078853130340576 }, { "auxiliary_loss_clip": 0.01024399, "auxiliary_loss_mlp": 0.01003031, "balance_loss_clip": 1.00005126, "balance_loss_mlp": 1.00275683, "epoch": 0.13870434390500527, "flos": 53368861743360.0, "grad_norm": 0.8668211906086138, "language_loss": 0.58999717, "learning_rate": 3.813368871743477e-06, "loss": 0.61027151, "num_input_tokens_seen": 49977620, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.21679688, "step": 2307, "time_per_iteration": 4.485899925231934 }, { "auxiliary_loss_clip": 0.0111669, "auxiliary_loss_mlp": 0.01045172, "balance_loss_clip": 1.02103209, "balance_loss_mlp": 1.03269196, "epoch": 0.13876446715767324, "flos": 22564784240640.0, "grad_norm": 2.4277061628479344, "language_loss": 0.79327637, "learning_rate": 3.813209470304531e-06, "loss": 0.81489497, "num_input_tokens_seen": 49996650, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 0.83984375, "step": 2308, "time_per_iteration": 3.7915420532226562 }, { "auxiliary_loss_clip": 0.01112279, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.01644635, "balance_loss_mlp": 1.03122795, "epoch": 0.1388245904103412, "flos": 20704157953920.0, "grad_norm": 2.7725072846561845, "language_loss": 0.77483279, "learning_rate": 3.813050004156802e-06, "loss": 0.79635036, "num_input_tokens_seen": 50015640, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.80859375, "step": 2309, "time_per_iteration": 3.744729995727539 }, { "auxiliary_loss_clip": 0.01115686, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.01646543, "balance_loss_mlp": 1.03120661, "epoch": 0.13888471366300917, "flos": 20553787261440.0, "grad_norm": 1.9923646727733035, "language_loss": 0.67644227, "learning_rate": 3.812890473305983e-06, "loss": 0.6979934, "num_input_tokens_seen": 50033500, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.84375, "step": 2310, "time_per_iteration": 3.7314958572387695 }, { "auxiliary_loss_clip": 0.01114428, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.01878774, "balance_loss_mlp": 1.03177369, "epoch": 0.13894483691567713, "flos": 13837370993280.0, "grad_norm": 1.927902290859768, "language_loss": 0.83659101, "learning_rate": 3.812730877757766e-06, "loss": 0.85816479, "num_input_tokens_seen": 50050075, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.828125, "step": 2311, "time_per_iteration": 2.350458860397339 }, { "auxiliary_loss_clip": 0.01116978, "auxiliary_loss_mlp": 0.01043033, "balance_loss_clip": 1.01932216, "balance_loss_mlp": 1.03216636, "epoch": 0.1390049601683451, "flos": 28030123382400.0, "grad_norm": 1.9602687914704884, "language_loss": 0.81861597, "learning_rate": 3.812571217517847e-06, "loss": 0.84021604, "num_input_tokens_seen": 50070080, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.84765625, "step": 2312, "time_per_iteration": 2.4183192253112793 }, { "auxiliary_loss_clip": 0.01115909, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.02151084, "balance_loss_mlp": 1.03175282, "epoch": 0.13906508342101306, "flos": 26755758512640.0, "grad_norm": 1.6884825216715873, "language_loss": 0.86466634, "learning_rate": 3.8124114925919234e-06, "loss": 0.88625932, "num_input_tokens_seen": 50090040, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.83984375, "step": 2313, "time_per_iteration": 2.4165217876434326 }, { "auxiliary_loss_clip": 0.01113991, "auxiliary_loss_mlp": 0.01050481, "balance_loss_clip": 1.02727151, "balance_loss_mlp": 1.03228283, "epoch": 0.13912520667368106, "flos": 24533955544320.0, "grad_norm": 1.9420178761846076, "language_loss": 0.79697347, "learning_rate": 3.812251702985696e-06, "loss": 0.81861818, "num_input_tokens_seen": 50110595, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8203125, "step": 2314, "time_per_iteration": 2.4216740131378174 }, { "auxiliary_loss_clip": 0.01115102, "auxiliary_loss_mlp": 0.01042126, "balance_loss_clip": 1.01780772, "balance_loss_mlp": 1.03328633, "epoch": 0.13918532992634902, "flos": 19382416502400.0, "grad_norm": 6.510030730031837, "language_loss": 0.85251737, "learning_rate": 3.8120918487048673e-06, "loss": 0.87408972, "num_input_tokens_seen": 50125430, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.81640625, "step": 2315, "time_per_iteration": 2.3296632766723633 }, { "auxiliary_loss_clip": 0.01113169, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.02389026, "balance_loss_mlp": 1.03022432, "epoch": 0.139245453179017, "flos": 21322714245120.0, "grad_norm": 2.1342939804528664, "language_loss": 0.77397943, "learning_rate": 3.8119319297551417e-06, "loss": 0.79558593, "num_input_tokens_seen": 50144120, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.828125, "step": 2316, "time_per_iteration": 2.3798413276672363 }, { "auxiliary_loss_clip": 0.01112326, "auxiliary_loss_mlp": 0.01044963, "balance_loss_clip": 1.02010787, "balance_loss_mlp": 1.03112698, "epoch": 0.13930557643168495, "flos": 19499584625280.0, "grad_norm": 1.6088289410619419, "language_loss": 0.76960433, "learning_rate": 3.811771946142226e-06, "loss": 0.79117715, "num_input_tokens_seen": 50162500, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.8125, "step": 2317, "time_per_iteration": 2.3655471801757812 }, { "auxiliary_loss_clip": 0.01115677, "auxiliary_loss_mlp": 0.01044519, "balance_loss_clip": 1.02172649, "balance_loss_mlp": 1.03252137, "epoch": 0.13936569968435292, "flos": 25409647065600.0, "grad_norm": 1.8353313151848425, "language_loss": 0.80771768, "learning_rate": 3.8116118978718298e-06, "loss": 0.82931966, "num_input_tokens_seen": 50182415, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.83203125, "step": 2318, "time_per_iteration": 2.4340999126434326 }, { "auxiliary_loss_clip": 0.01022548, "auxiliary_loss_mlp": 0.01004856, "balance_loss_clip": 1.00195885, "balance_loss_mlp": 1.00148022, "epoch": 0.13942582293702088, "flos": 70767372481920.0, "grad_norm": 0.8539343675442279, "language_loss": 0.59066468, "learning_rate": 3.811451784949665e-06, "loss": 0.61093873, "num_input_tokens_seen": 50245160, "router_z_loss_clip": 0.02893066, "router_z_loss_mlp": 0.2109375, "step": 2319, "time_per_iteration": 3.0526671409606934 }, { "auxiliary_loss_clip": 0.01116876, "auxiliary_loss_mlp": 0.01048714, "balance_loss_clip": 1.02567124, "balance_loss_mlp": 1.03278899, "epoch": 0.13948594618968888, "flos": 35589412627200.0, "grad_norm": 2.446227278608528, "language_loss": 0.65113854, "learning_rate": 3.811291607381446e-06, "loss": 0.67279446, "num_input_tokens_seen": 50268215, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.84375, "step": 2320, "time_per_iteration": 2.50424861907959 }, { "auxiliary_loss_clip": 0.01112892, "auxiliary_loss_mlp": 0.01039067, "balance_loss_clip": 1.01576197, "balance_loss_mlp": 1.03142297, "epoch": 0.13954606944235684, "flos": 21104157934080.0, "grad_norm": 1.5250669828234587, "language_loss": 0.70898479, "learning_rate": 3.8111313651728887e-06, "loss": 0.73050439, "num_input_tokens_seen": 50288575, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.81640625, "step": 2321, "time_per_iteration": 2.387446165084839 }, { "auxiliary_loss_clip": 0.0111377, "auxiliary_loss_mlp": 0.01048863, "balance_loss_clip": 1.02609396, "balance_loss_mlp": 1.03025997, "epoch": 0.1396061926950248, "flos": 25043303502720.0, "grad_norm": 1.8167873724049057, "language_loss": 0.85633826, "learning_rate": 3.810971058329712e-06, "loss": 0.8779645, "num_input_tokens_seen": 50308735, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.8359375, "step": 2322, "time_per_iteration": 2.4280426502227783 }, { "auxiliary_loss_clip": 0.01107724, "auxiliary_loss_mlp": 0.010402, "balance_loss_clip": 1.01770544, "balance_loss_mlp": 1.02950573, "epoch": 0.13966631594769277, "flos": 37632495012480.0, "grad_norm": 1.7811434630357614, "language_loss": 0.67362523, "learning_rate": 3.810810686857636e-06, "loss": 0.69510448, "num_input_tokens_seen": 50331025, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.78125, "step": 2323, "time_per_iteration": 2.543933629989624 }, { "auxiliary_loss_clip": 0.01119111, "auxiliary_loss_mlp": 0.01041933, "balance_loss_clip": 1.01809096, "balance_loss_mlp": 1.03224778, "epoch": 0.13972643920036074, "flos": 16690053962880.0, "grad_norm": 1.88168631675888, "language_loss": 0.88742232, "learning_rate": 3.8106502507623847e-06, "loss": 0.90903276, "num_input_tokens_seen": 50349725, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.8671875, "step": 2324, "time_per_iteration": 2.3862104415893555 }, { "auxiliary_loss_clip": 0.01114269, "auxiliary_loss_mlp": 0.01046201, "balance_loss_clip": 1.02154899, "balance_loss_mlp": 1.02962196, "epoch": 0.1397865624530287, "flos": 23329940797440.0, "grad_norm": 2.4616348831774024, "language_loss": 0.70485055, "learning_rate": 3.810489750049684e-06, "loss": 0.72645521, "num_input_tokens_seen": 50367965, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.84375, "step": 2325, "time_per_iteration": 2.3760411739349365 }, { "auxiliary_loss_clip": 0.01114765, "auxiliary_loss_mlp": 0.0104497, "balance_loss_clip": 1.02260661, "balance_loss_mlp": 1.03328538, "epoch": 0.13984668570569667, "flos": 22777370709120.0, "grad_norm": 2.194120627576144, "language_loss": 0.81632841, "learning_rate": 3.810329184725261e-06, "loss": 0.83792573, "num_input_tokens_seen": 50385605, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.81640625, "step": 2326, "time_per_iteration": 2.393697500228882 }, { "auxiliary_loss_clip": 0.01113238, "auxiliary_loss_mlp": 0.01043437, "balance_loss_clip": 1.02263474, "balance_loss_mlp": 1.0323596, "epoch": 0.13990680895836466, "flos": 19463519324160.0, "grad_norm": 1.679143758752739, "language_loss": 0.88916981, "learning_rate": 3.8101685547948456e-06, "loss": 0.91073656, "num_input_tokens_seen": 50403985, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.80859375, "step": 2327, "time_per_iteration": 2.3681604862213135 }, { "auxiliary_loss_clip": 0.01111782, "auxiliary_loss_mlp": 0.01051522, "balance_loss_clip": 1.02977872, "balance_loss_mlp": 1.03227198, "epoch": 0.13996693221103262, "flos": 20302237848960.0, "grad_norm": 2.3029894270625078, "language_loss": 0.84684706, "learning_rate": 3.8100078602641714e-06, "loss": 0.86848009, "num_input_tokens_seen": 50421590, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.796875, "step": 2328, "time_per_iteration": 2.3757174015045166 }, { "auxiliary_loss_clip": 0.01113621, "auxiliary_loss_mlp": 0.01045352, "balance_loss_clip": 1.02246439, "balance_loss_mlp": 1.03034067, "epoch": 0.1400270554637006, "flos": 26616419809920.0, "grad_norm": 1.5164445313294197, "language_loss": 0.74061275, "learning_rate": 3.8098471011389723e-06, "loss": 0.7622025, "num_input_tokens_seen": 50443945, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.83203125, "step": 2329, "time_per_iteration": 2.428548574447632 }, { "auxiliary_loss_clip": 0.01112129, "auxiliary_loss_mlp": 0.01044525, "balance_loss_clip": 1.02216148, "balance_loss_mlp": 1.02910054, "epoch": 0.14008717871636855, "flos": 19390446115200.0, "grad_norm": 2.297219445526394, "language_loss": 0.7825973, "learning_rate": 3.809686277424986e-06, "loss": 0.80416381, "num_input_tokens_seen": 50462065, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.828125, "step": 2330, "time_per_iteration": 2.3700814247131348 }, { "auxiliary_loss_clip": 0.01110938, "auxiliary_loss_mlp": 0.01038475, "balance_loss_clip": 1.01670766, "balance_loss_mlp": 1.03057289, "epoch": 0.14014730196903652, "flos": 15303373649280.0, "grad_norm": 2.6356683996312147, "language_loss": 0.71626061, "learning_rate": 3.809525389127951e-06, "loss": 0.7377547, "num_input_tokens_seen": 50479565, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.8046875, "step": 2331, "time_per_iteration": 2.3808858394622803 }, { "auxiliary_loss_clip": 0.01108511, "auxiliary_loss_mlp": 0.01043533, "balance_loss_clip": 1.02311277, "balance_loss_mlp": 1.03137755, "epoch": 0.14020742522170448, "flos": 14938810565760.0, "grad_norm": 1.8915342608415047, "language_loss": 0.7251972, "learning_rate": 3.8093644362536094e-06, "loss": 0.74671763, "num_input_tokens_seen": 50497305, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.7734375, "step": 2332, "time_per_iteration": 2.3650176525115967 }, { "auxiliary_loss_clip": 0.01022938, "auxiliary_loss_mlp": 0.01007537, "balance_loss_clip": 1.00495028, "balance_loss_mlp": 1.00183344, "epoch": 0.14026754847437245, "flos": 48822017316480.0, "grad_norm": 0.8126341124100568, "language_loss": 0.56089938, "learning_rate": 3.809203418807706e-06, "loss": 0.58120418, "num_input_tokens_seen": 50549735, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.2109375, "step": 2333, "time_per_iteration": 2.8726682662963867 }, { "auxiliary_loss_clip": 0.01113125, "auxiliary_loss_mlp": 0.01046742, "balance_loss_clip": 1.02356756, "balance_loss_mlp": 1.03146529, "epoch": 0.14032767172704044, "flos": 25772150378880.0, "grad_norm": 1.6596230994853203, "language_loss": 0.82564056, "learning_rate": 3.8090423367959862e-06, "loss": 0.84723926, "num_input_tokens_seen": 50570100, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.81640625, "step": 2334, "time_per_iteration": 2.431256055831909 }, { "auxiliary_loss_clip": 0.01109192, "auxiliary_loss_mlp": 0.01039683, "balance_loss_clip": 1.01901162, "balance_loss_mlp": 1.02880728, "epoch": 0.1403877949797084, "flos": 21215216568960.0, "grad_norm": 1.8070704326569684, "language_loss": 0.81511354, "learning_rate": 3.8088811902241984e-06, "loss": 0.83660233, "num_input_tokens_seen": 50589185, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.8046875, "step": 2335, "time_per_iteration": 2.418689012527466 }, { "auxiliary_loss_clip": 0.01118209, "auxiliary_loss_mlp": 0.01050244, "balance_loss_clip": 1.02465022, "balance_loss_mlp": 1.03229046, "epoch": 0.14044791823237637, "flos": 22746856314240.0, "grad_norm": 1.5857080281952594, "language_loss": 0.8213681, "learning_rate": 3.8087199790980943e-06, "loss": 0.84305263, "num_input_tokens_seen": 50609645, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 0.859375, "step": 2336, "time_per_iteration": 2.3965723514556885 }, { "auxiliary_loss_clip": 0.01112836, "auxiliary_loss_mlp": 0.01041006, "balance_loss_clip": 1.01849961, "balance_loss_mlp": 1.03063273, "epoch": 0.14050804148504434, "flos": 22963387766400.0, "grad_norm": 1.6075359232070303, "language_loss": 0.80349731, "learning_rate": 3.8085587034234268e-06, "loss": 0.82503575, "num_input_tokens_seen": 50628385, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.8203125, "step": 2337, "time_per_iteration": 2.393561363220215 }, { "auxiliary_loss_clip": 0.01115161, "auxiliary_loss_mlp": 0.01051235, "balance_loss_clip": 1.02847803, "balance_loss_mlp": 1.03135502, "epoch": 0.1405681647377123, "flos": 22199243639040.0, "grad_norm": 3.0750946669038184, "language_loss": 0.79212838, "learning_rate": 3.8083973632059507e-06, "loss": 0.81379235, "num_input_tokens_seen": 50647260, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.83984375, "step": 2338, "time_per_iteration": 2.384571075439453 }, { "auxiliary_loss_clip": 0.01119153, "auxiliary_loss_mlp": 0.01042763, "balance_loss_clip": 1.01802742, "balance_loss_mlp": 1.03551793, "epoch": 0.14062828799038027, "flos": 23731651434240.0, "grad_norm": 2.5063985567556615, "language_loss": 0.79717278, "learning_rate": 3.8082359584514254e-06, "loss": 0.81879199, "num_input_tokens_seen": 50666130, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.8359375, "step": 2339, "time_per_iteration": 2.4049720764160156 }, { "auxiliary_loss_clip": 0.01113783, "auxiliary_loss_mlp": 0.01044787, "balance_loss_clip": 1.02204204, "balance_loss_mlp": 1.03190649, "epoch": 0.14068841124304826, "flos": 39200933197440.0, "grad_norm": 2.0345611069085847, "language_loss": 0.65627486, "learning_rate": 3.8080744891656095e-06, "loss": 0.67786056, "num_input_tokens_seen": 50687440, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.8203125, "step": 2340, "time_per_iteration": 2.5427424907684326 }, { "auxiliary_loss_clip": 0.01112406, "auxiliary_loss_mlp": 0.01043054, "balance_loss_clip": 1.01995158, "balance_loss_mlp": 1.03261447, "epoch": 0.14074853449571623, "flos": 20191283948160.0, "grad_norm": 2.969123965413322, "language_loss": 0.77967715, "learning_rate": 3.807912955354266e-06, "loss": 0.80123174, "num_input_tokens_seen": 50704030, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.796875, "step": 2341, "time_per_iteration": 2.3721840381622314 }, { "auxiliary_loss_clip": 0.0110888, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.02176309, "balance_loss_mlp": 1.03030443, "epoch": 0.1408086577483842, "flos": 18404882945280.0, "grad_norm": 1.8885129295917387, "language_loss": 0.80213922, "learning_rate": 3.80775135702316e-06, "loss": 0.8236773, "num_input_tokens_seen": 50723305, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.78515625, "step": 2342, "time_per_iteration": 2.3720855712890625 }, { "auxiliary_loss_clip": 0.01110901, "auxiliary_loss_mlp": 0.01046541, "balance_loss_clip": 1.0253576, "balance_loss_mlp": 1.03243005, "epoch": 0.14086878100105216, "flos": 25263430824960.0, "grad_norm": 1.9349407206328697, "language_loss": 0.78248572, "learning_rate": 3.8075896941780576e-06, "loss": 0.80406016, "num_input_tokens_seen": 50743270, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.78515625, "step": 2343, "time_per_iteration": 2.4285085201263428 }, { "auxiliary_loss_clip": 0.01024611, "auxiliary_loss_mlp": 0.01005424, "balance_loss_clip": 1.00275409, "balance_loss_mlp": 1.00350428, "epoch": 0.14092890425372012, "flos": 65975194730880.0, "grad_norm": 0.9113845422778512, "language_loss": 0.61496663, "learning_rate": 3.807427966824729e-06, "loss": 0.63526696, "num_input_tokens_seen": 50802710, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.2109375, "step": 2344, "time_per_iteration": 2.959873914718628 }, { "auxiliary_loss_clip": 0.01110289, "auxiliary_loss_mlp": 0.0104094, "balance_loss_clip": 1.01946998, "balance_loss_mlp": 1.03021097, "epoch": 0.1409890275063881, "flos": 23693875476480.0, "grad_norm": 1.5345214861639942, "language_loss": 0.644485, "learning_rate": 3.807266174968946e-06, "loss": 0.66599727, "num_input_tokens_seen": 50822625, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.80078125, "step": 2345, "time_per_iteration": 2.4022178649902344 }, { "auxiliary_loss_clip": 0.0111492, "auxiliary_loss_mlp": 0.0103954, "balance_loss_clip": 1.01692581, "balance_loss_mlp": 1.02984154, "epoch": 0.14104915075905605, "flos": 23622024165120.0, "grad_norm": 3.732364891862398, "language_loss": 0.72913074, "learning_rate": 3.8071043186164813e-06, "loss": 0.75067532, "num_input_tokens_seen": 50842330, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.8515625, "step": 2346, "time_per_iteration": 3.8668501377105713 }, { "auxiliary_loss_clip": 0.01115043, "auxiliary_loss_mlp": 0.01048066, "balance_loss_clip": 1.02484357, "balance_loss_mlp": 1.03207099, "epoch": 0.14110927401172405, "flos": 20594111748480.0, "grad_norm": 3.1926601726259403, "language_loss": 0.77061605, "learning_rate": 3.8069423977731123e-06, "loss": 0.79224718, "num_input_tokens_seen": 50861035, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.828125, "step": 2347, "time_per_iteration": 3.761927366256714 }, { "auxiliary_loss_clip": 0.01112526, "auxiliary_loss_mlp": 0.01043529, "balance_loss_clip": 1.02202344, "balance_loss_mlp": 1.02968144, "epoch": 0.141169397264392, "flos": 28546802726400.0, "grad_norm": 2.338335469689385, "language_loss": 0.76286185, "learning_rate": 3.8067804124446167e-06, "loss": 0.78442234, "num_input_tokens_seen": 50880105, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.828125, "step": 2348, "time_per_iteration": 2.4172322750091553 }, { "auxiliary_loss_clip": 0.01113536, "auxiliary_loss_mlp": 0.01045398, "balance_loss_clip": 1.02209258, "balance_loss_mlp": 1.03189969, "epoch": 0.14122952051705998, "flos": 17091310752000.0, "grad_norm": 1.714294255083456, "language_loss": 0.86320311, "learning_rate": 3.806618362636776e-06, "loss": 0.88479245, "num_input_tokens_seen": 50897720, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.8203125, "step": 2349, "time_per_iteration": 3.782675266265869 }, { "auxiliary_loss_clip": 0.01112457, "auxiliary_loss_mlp": 0.01042913, "balance_loss_clip": 1.02048922, "balance_loss_mlp": 1.03230882, "epoch": 0.14128964376972794, "flos": 28945615720320.0, "grad_norm": 1.6260149264212769, "language_loss": 0.89123261, "learning_rate": 3.806456248355373e-06, "loss": 0.91278625, "num_input_tokens_seen": 50918385, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.80078125, "step": 2350, "time_per_iteration": 3.825753927230835 }, { "auxiliary_loss_clip": 0.01117982, "auxiliary_loss_mlp": 0.01044442, "balance_loss_clip": 1.02080297, "balance_loss_mlp": 1.03451514, "epoch": 0.1413497670223959, "flos": 18988770389760.0, "grad_norm": 1.6992135889614395, "language_loss": 0.81226486, "learning_rate": 3.806294069606194e-06, "loss": 0.83388907, "num_input_tokens_seen": 50938270, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 0.8359375, "step": 2351, "time_per_iteration": 2.3727540969848633 }, { "auxiliary_loss_clip": 0.01115487, "auxiliary_loss_mlp": 0.01041299, "balance_loss_clip": 1.01912642, "balance_loss_mlp": 1.03287506, "epoch": 0.14140989027506387, "flos": 29860933501440.0, "grad_norm": 2.4282329595428696, "language_loss": 0.83351785, "learning_rate": 3.806131826395025e-06, "loss": 0.85508567, "num_input_tokens_seen": 50958155, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.828125, "step": 2352, "time_per_iteration": 2.443225860595703 }, { "auxiliary_loss_clip": 0.01023041, "auxiliary_loss_mlp": 0.01003801, "balance_loss_clip": 1.001441, "balance_loss_mlp": 1.00168443, "epoch": 0.14147001352773186, "flos": 62076303826560.0, "grad_norm": 0.9060145135068637, "language_loss": 0.61919022, "learning_rate": 3.805969518727658e-06, "loss": 0.63945866, "num_input_tokens_seen": 51020705, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.21289062, "step": 2353, "time_per_iteration": 2.951414108276367 }, { "auxiliary_loss_clip": 0.01110853, "auxiliary_loss_mlp": 0.01044185, "balance_loss_clip": 1.02258432, "balance_loss_mlp": 1.0318141, "epoch": 0.14153013678039983, "flos": 22016438426880.0, "grad_norm": 1.6978982977159032, "language_loss": 0.87054855, "learning_rate": 3.805807146609884e-06, "loss": 0.8920989, "num_input_tokens_seen": 51039995, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.7890625, "step": 2354, "time_per_iteration": 2.403468132019043 }, { "auxiliary_loss_clip": 0.01113959, "auxiliary_loss_mlp": 0.0104906, "balance_loss_clip": 1.02529013, "balance_loss_mlp": 1.03166056, "epoch": 0.1415902600330678, "flos": 19719048631680.0, "grad_norm": 2.191536556128632, "language_loss": 0.74257559, "learning_rate": 3.8056447100474976e-06, "loss": 0.76420581, "num_input_tokens_seen": 51059075, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.82421875, "step": 2355, "time_per_iteration": 2.3797683715820312 }, { "auxiliary_loss_clip": 0.01022638, "auxiliary_loss_mlp": 0.01004672, "balance_loss_clip": 1.00213301, "balance_loss_mlp": 1.00106001, "epoch": 0.14165038328573576, "flos": 65897862336000.0, "grad_norm": 0.6813695692162474, "language_loss": 0.51837111, "learning_rate": 3.8054822090462963e-06, "loss": 0.53864413, "num_input_tokens_seen": 51120380, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.21484375, "step": 2356, "time_per_iteration": 2.9988350868225098 }, { "auxiliary_loss_clip": 0.01111905, "auxiliary_loss_mlp": 0.01049433, "balance_loss_clip": 1.0279392, "balance_loss_mlp": 1.03195858, "epoch": 0.14171050653840372, "flos": 12129349726080.0, "grad_norm": 2.2499940840965778, "language_loss": 0.71124399, "learning_rate": 3.80531964361208e-06, "loss": 0.73285735, "num_input_tokens_seen": 51136950, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.796875, "step": 2357, "time_per_iteration": 2.3525867462158203 }, { "auxiliary_loss_clip": 0.01115533, "auxiliary_loss_mlp": 0.01045419, "balance_loss_clip": 1.02374637, "balance_loss_mlp": 1.0324074, "epoch": 0.1417706297910717, "flos": 20411446181760.0, "grad_norm": 3.382316394087526, "language_loss": 0.81723762, "learning_rate": 3.8051570137506485e-06, "loss": 0.8388471, "num_input_tokens_seen": 51155175, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.83203125, "step": 2358, "time_per_iteration": 2.3947958946228027 }, { "auxiliary_loss_clip": 0.01116677, "auxiliary_loss_mlp": 0.0104764, "balance_loss_clip": 1.02472782, "balance_loss_mlp": 1.03283024, "epoch": 0.14183075304373965, "flos": 22379570144640.0, "grad_norm": 2.021207632014741, "language_loss": 0.71728957, "learning_rate": 3.804994319467807e-06, "loss": 0.73893273, "num_input_tokens_seen": 51174500, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.8359375, "step": 2359, "time_per_iteration": 2.4015228748321533 }, { "auxiliary_loss_clip": 0.01110608, "auxiliary_loss_mlp": 0.01036418, "balance_loss_clip": 1.01448345, "balance_loss_mlp": 1.03044295, "epoch": 0.14189087629640765, "flos": 21579814563840.0, "grad_norm": 2.0088275417241963, "language_loss": 0.75609106, "learning_rate": 3.804831560769361e-06, "loss": 0.77756137, "num_input_tokens_seen": 51194270, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.80078125, "step": 2360, "time_per_iteration": 2.3866117000579834 }, { "auxiliary_loss_clip": 0.01111825, "auxiliary_loss_mlp": 0.01045388, "balance_loss_clip": 1.02290535, "balance_loss_mlp": 1.0320853, "epoch": 0.1419509995490756, "flos": 20007605952000.0, "grad_norm": 1.9198538889155847, "language_loss": 0.81491876, "learning_rate": 3.8046687376611196e-06, "loss": 0.83649093, "num_input_tokens_seen": 51211850, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.796875, "step": 2361, "time_per_iteration": 2.364715337753296 }, { "auxiliary_loss_clip": 0.0111196, "auxiliary_loss_mlp": 0.01043956, "balance_loss_clip": 1.02096045, "balance_loss_mlp": 1.0317812, "epoch": 0.14201112280174358, "flos": 31940116099200.0, "grad_norm": 1.9110472055203933, "language_loss": 0.74089873, "learning_rate": 3.8045058501488927e-06, "loss": 0.76245791, "num_input_tokens_seen": 51233545, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.8046875, "step": 2362, "time_per_iteration": 2.4871034622192383 }, { "auxiliary_loss_clip": 0.01113443, "auxiliary_loss_mlp": 0.01040609, "balance_loss_clip": 1.01844823, "balance_loss_mlp": 1.03260601, "epoch": 0.14207124605441154, "flos": 41462536982400.0, "grad_norm": 1.6877583599104975, "language_loss": 0.73817307, "learning_rate": 3.804342898238494e-06, "loss": 0.75971359, "num_input_tokens_seen": 51257615, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.80859375, "step": 2363, "time_per_iteration": 2.5701701641082764 }, { "auxiliary_loss_clip": 0.01111354, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 1.02141619, "balance_loss_mlp": 1.03112841, "epoch": 0.1421313693070795, "flos": 31903736595840.0, "grad_norm": 1.77503551669461, "language_loss": 0.72893143, "learning_rate": 3.8041798819357386e-06, "loss": 0.75046718, "num_input_tokens_seen": 51279645, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.8046875, "step": 2364, "time_per_iteration": 2.4738662242889404 }, { "auxiliary_loss_clip": 0.01108281, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.01900244, "balance_loss_mlp": 1.03060389, "epoch": 0.14219149255974747, "flos": 26869924258560.0, "grad_norm": 2.37641305853089, "language_loss": 0.9059546, "learning_rate": 3.804016801246444e-06, "loss": 0.92743063, "num_input_tokens_seen": 51299775, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.77734375, "step": 2365, "time_per_iteration": 2.4367516040802 }, { "auxiliary_loss_clip": 0.01110698, "auxiliary_loss_mlp": 0.01040614, "balance_loss_clip": 1.01842928, "balance_loss_mlp": 1.02955818, "epoch": 0.14225161581241544, "flos": 27453183298560.0, "grad_norm": 1.680012895617236, "language_loss": 0.65590346, "learning_rate": 3.80385365617643e-06, "loss": 0.67741668, "num_input_tokens_seen": 51319430, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.8125, "step": 2366, "time_per_iteration": 2.4501473903656006 }, { "auxiliary_loss_clip": 0.01107549, "auxiliary_loss_mlp": 0.01038612, "balance_loss_clip": 1.01609302, "balance_loss_mlp": 1.02917778, "epoch": 0.14231173906508343, "flos": 10560667161600.0, "grad_norm": 2.2568984621358297, "language_loss": 0.80072278, "learning_rate": 3.8036904467315196e-06, "loss": 0.82218438, "num_input_tokens_seen": 51336045, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.78125, "step": 2367, "time_per_iteration": 2.36179256439209 }, { "auxiliary_loss_clip": 0.01115014, "auxiliary_loss_mlp": 0.0105425, "balance_loss_clip": 1.03032434, "balance_loss_mlp": 1.03221178, "epoch": 0.1423718623177514, "flos": 28359773239680.0, "grad_norm": 2.2982741802170543, "language_loss": 0.82969856, "learning_rate": 3.8035271729175366e-06, "loss": 0.8513912, "num_input_tokens_seen": 51357030, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.828125, "step": 2368, "time_per_iteration": 2.4400994777679443 }, { "auxiliary_loss_clip": 0.01111815, "auxiliary_loss_mlp": 0.01049996, "balance_loss_clip": 1.02657127, "balance_loss_mlp": 1.03188169, "epoch": 0.14243198557041936, "flos": 19353228739200.0, "grad_norm": 2.225072538149742, "language_loss": 0.86660296, "learning_rate": 3.803363834740308e-06, "loss": 0.88822114, "num_input_tokens_seen": 51374890, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.796875, "step": 2369, "time_per_iteration": 2.3709802627563477 }, { "auxiliary_loss_clip": 0.01111436, "auxiliary_loss_mlp": 0.0104525, "balance_loss_clip": 1.021909, "balance_loss_mlp": 1.0288223, "epoch": 0.14249210882308733, "flos": 28805508967680.0, "grad_norm": 1.5397197418609387, "language_loss": 0.7586726, "learning_rate": 3.8032004322056627e-06, "loss": 0.7802394, "num_input_tokens_seen": 51398100, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.828125, "step": 2370, "time_per_iteration": 2.4590070247650146 }, { "auxiliary_loss_clip": 0.01111055, "auxiliary_loss_mlp": 0.01046763, "balance_loss_clip": 1.02507842, "balance_loss_mlp": 1.03099585, "epoch": 0.1425522320757553, "flos": 21833947416960.0, "grad_norm": 1.7583329430782595, "language_loss": 0.83001393, "learning_rate": 3.8030369653194326e-06, "loss": 0.85159212, "num_input_tokens_seen": 51418745, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.80078125, "step": 2371, "time_per_iteration": 2.3887088298797607 }, { "auxiliary_loss_clip": 0.01113674, "auxiliary_loss_mlp": 0.0104464, "balance_loss_clip": 1.02160835, "balance_loss_mlp": 1.03213429, "epoch": 0.14261235532842326, "flos": 17310495467520.0, "grad_norm": 1.9612660560625004, "language_loss": 0.82770348, "learning_rate": 3.802873434087451e-06, "loss": 0.84928668, "num_input_tokens_seen": 51437455, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.81640625, "step": 2372, "time_per_iteration": 2.3595197200775146 }, { "auxiliary_loss_clip": 0.01111961, "auxiliary_loss_mlp": 0.0104377, "balance_loss_clip": 1.02153802, "balance_loss_mlp": 1.03225899, "epoch": 0.14267247858109125, "flos": 18805755709440.0, "grad_norm": 3.171101260252954, "language_loss": 0.84952039, "learning_rate": 3.8027098385155546e-06, "loss": 0.87107772, "num_input_tokens_seen": 51455710, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.796875, "step": 2373, "time_per_iteration": 2.3554599285125732 }, { "auxiliary_loss_clip": 0.01108555, "auxiliary_loss_mlp": 0.01047167, "balance_loss_clip": 1.0272826, "balance_loss_mlp": 1.02993953, "epoch": 0.14273260183375922, "flos": 11358258238080.0, "grad_norm": 1.9497593446538968, "language_loss": 0.85978901, "learning_rate": 3.802546178609581e-06, "loss": 0.88134623, "num_input_tokens_seen": 51471270, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.78515625, "step": 2374, "time_per_iteration": 2.3471994400024414 }, { "auxiliary_loss_clip": 0.01116023, "auxiliary_loss_mlp": 0.01048151, "balance_loss_clip": 1.02371335, "balance_loss_mlp": 1.03108776, "epoch": 0.14279272508642718, "flos": 27566336615040.0, "grad_norm": 1.6567888506182693, "language_loss": 0.79175425, "learning_rate": 3.8023824543753706e-06, "loss": 0.81339598, "num_input_tokens_seen": 51492705, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.8515625, "step": 2375, "time_per_iteration": 2.4538514614105225 }, { "auxiliary_loss_clip": 0.01114544, "auxiliary_loss_mlp": 0.01052575, "balance_loss_clip": 1.0297699, "balance_loss_mlp": 1.03305233, "epoch": 0.14285284833909515, "flos": 16251649620480.0, "grad_norm": 2.5280824056824294, "language_loss": 0.76490855, "learning_rate": 3.802218665818767e-06, "loss": 0.78657973, "num_input_tokens_seen": 51510780, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.81640625, "step": 2376, "time_per_iteration": 2.356001138687134 }, { "auxiliary_loss_clip": 0.01111805, "auxiliary_loss_mlp": 0.01041119, "balance_loss_clip": 1.0195061, "balance_loss_mlp": 1.03157842, "epoch": 0.1429129715917631, "flos": 19754590262400.0, "grad_norm": 1.822378925866651, "language_loss": 0.93101025, "learning_rate": 3.802054812945615e-06, "loss": 0.95253944, "num_input_tokens_seen": 51531400, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.8046875, "step": 2377, "time_per_iteration": 2.403977870941162 }, { "auxiliary_loss_clip": 0.01109632, "auxiliary_loss_mlp": 0.0104081, "balance_loss_clip": 1.01652718, "balance_loss_mlp": 1.02863622, "epoch": 0.14297309484443108, "flos": 21136173517440.0, "grad_norm": 2.0131222186116404, "language_loss": 0.91564405, "learning_rate": 3.801890895761762e-06, "loss": 0.93714845, "num_input_tokens_seen": 51548215, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.8125, "step": 2378, "time_per_iteration": 2.373382091522217 }, { "auxiliary_loss_clip": 0.01113144, "auxiliary_loss_mlp": 0.01038751, "balance_loss_clip": 1.0161612, "balance_loss_mlp": 1.02985239, "epoch": 0.14303321809709904, "flos": 23585539927680.0, "grad_norm": 1.6098071087419281, "language_loss": 0.73419136, "learning_rate": 3.8017269142730584e-06, "loss": 0.75571024, "num_input_tokens_seen": 51566820, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.83203125, "step": 2379, "time_per_iteration": 2.3976776599884033 }, { "auxiliary_loss_clip": 0.01108895, "auxiliary_loss_mlp": 0.01051303, "balance_loss_clip": 1.02926159, "balance_loss_mlp": 1.0289185, "epoch": 0.14309334134976703, "flos": 15887365827840.0, "grad_norm": 1.961125634825258, "language_loss": 0.78647882, "learning_rate": 3.801562868485355e-06, "loss": 0.80808079, "num_input_tokens_seen": 51585075, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.80078125, "step": 2380, "time_per_iteration": 2.3603386878967285 }, { "auxiliary_loss_clip": 0.01115999, "auxiliary_loss_mlp": 0.01042334, "balance_loss_clip": 1.0200417, "balance_loss_mlp": 1.03406525, "epoch": 0.143153464602435, "flos": 16324687918080.0, "grad_norm": 2.1120644519583975, "language_loss": 0.88181722, "learning_rate": 3.801398758404508e-06, "loss": 0.90340054, "num_input_tokens_seen": 51603185, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8203125, "step": 2381, "time_per_iteration": 2.377686023712158 }, { "auxiliary_loss_clip": 0.01109959, "auxiliary_loss_mlp": 0.01041204, "balance_loss_clip": 1.0184474, "balance_loss_mlp": 1.0317328, "epoch": 0.14321358785510296, "flos": 17091136195200.0, "grad_norm": 2.218149750101264, "language_loss": 0.76782626, "learning_rate": 3.801234584036372e-06, "loss": 0.78933787, "num_input_tokens_seen": 51620880, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.78125, "step": 2382, "time_per_iteration": 2.4079976081848145 }, { "auxiliary_loss_clip": 0.01110099, "auxiliary_loss_mlp": 0.01046025, "balance_loss_clip": 1.02367353, "balance_loss_mlp": 1.02952552, "epoch": 0.14327371110777093, "flos": 26321718090240.0, "grad_norm": 2.2360123700006933, "language_loss": 0.76984197, "learning_rate": 3.801070345386808e-06, "loss": 0.79140317, "num_input_tokens_seen": 51640170, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.8046875, "step": 2383, "time_per_iteration": 2.426485776901245 }, { "auxiliary_loss_clip": 0.01112413, "auxiliary_loss_mlp": 0.01048505, "balance_loss_clip": 1.02380538, "balance_loss_mlp": 1.02988279, "epoch": 0.1433338343604389, "flos": 18075512378880.0, "grad_norm": 2.3204216584220494, "language_loss": 0.87647116, "learning_rate": 3.8009060424616757e-06, "loss": 0.89808035, "num_input_tokens_seen": 51656580, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 0.82421875, "step": 2384, "time_per_iteration": 2.3518261909484863 }, { "auxiliary_loss_clip": 0.01115697, "auxiliary_loss_mlp": 0.01045557, "balance_loss_clip": 1.02165604, "balance_loss_mlp": 1.03101289, "epoch": 0.14339395761310686, "flos": 15521895048960.0, "grad_norm": 2.254275462304245, "language_loss": 0.79344857, "learning_rate": 3.800741675266839e-06, "loss": 0.81506115, "num_input_tokens_seen": 51674645, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 0.84375, "step": 2385, "time_per_iteration": 3.7864675521850586 }, { "auxiliary_loss_clip": 0.01109576, "auxiliary_loss_mlp": 0.01043029, "balance_loss_clip": 1.0214045, "balance_loss_mlp": 1.03019714, "epoch": 0.14345408086577485, "flos": 28547500953600.0, "grad_norm": 1.680801224843066, "language_loss": 0.75024277, "learning_rate": 3.8005772438081645e-06, "loss": 0.77176881, "num_input_tokens_seen": 51695770, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.79296875, "step": 2386, "time_per_iteration": 3.8261818885803223 }, { "auxiliary_loss_clip": 0.01111145, "auxiliary_loss_mlp": 0.01040607, "balance_loss_clip": 1.01887536, "balance_loss_mlp": 1.03130329, "epoch": 0.14351420411844282, "flos": 20229024994560.0, "grad_norm": 2.069877764197071, "language_loss": 0.78709936, "learning_rate": 3.80041274809152e-06, "loss": 0.80861688, "num_input_tokens_seen": 51714165, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.796875, "step": 2387, "time_per_iteration": 2.3673291206359863 }, { "auxiliary_loss_clip": 0.01109196, "auxiliary_loss_mlp": 0.01045684, "balance_loss_clip": 1.02303433, "balance_loss_mlp": 1.02931619, "epoch": 0.14357432737111078, "flos": 19864008063360.0, "grad_norm": 2.107675298355465, "language_loss": 0.82349843, "learning_rate": 3.8002481881227753e-06, "loss": 0.84504724, "num_input_tokens_seen": 51734440, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.796875, "step": 2388, "time_per_iteration": 2.386894702911377 }, { "auxiliary_loss_clip": 0.01111769, "auxiliary_loss_mlp": 0.01042883, "balance_loss_clip": 1.02164018, "balance_loss_mlp": 1.03092527, "epoch": 0.14363445062377875, "flos": 28255557231360.0, "grad_norm": 2.801989469982729, "language_loss": 0.82503819, "learning_rate": 3.8000835639078038e-06, "loss": 0.8465848, "num_input_tokens_seen": 51753730, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.80859375, "step": 2389, "time_per_iteration": 5.149309873580933 }, { "auxiliary_loss_clip": 0.01113311, "auxiliary_loss_mlp": 0.01045806, "balance_loss_clip": 1.02178502, "balance_loss_mlp": 1.03083026, "epoch": 0.1436945738764467, "flos": 18185698229760.0, "grad_norm": 1.9683773180119561, "language_loss": 0.83158261, "learning_rate": 3.79991887545248e-06, "loss": 0.85317379, "num_input_tokens_seen": 51771195, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.82421875, "step": 2390, "time_per_iteration": 2.348984479904175 }, { "auxiliary_loss_clip": 0.01108689, "auxiliary_loss_mlp": 0.01044275, "balance_loss_clip": 1.02272224, "balance_loss_mlp": 1.02935779, "epoch": 0.14375469712911468, "flos": 27306687767040.0, "grad_norm": 1.530518587021261, "language_loss": 0.74943447, "learning_rate": 3.799754122762682e-06, "loss": 0.77096415, "num_input_tokens_seen": 51792290, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.79296875, "step": 2391, "time_per_iteration": 2.4368858337402344 }, { "auxiliary_loss_clip": 0.0102576, "auxiliary_loss_mlp": 0.01002298, "balance_loss_clip": 0.99986637, "balance_loss_mlp": 1.00419617, "epoch": 0.14381482038178264, "flos": 56888559838080.0, "grad_norm": 1.0154385386077425, "language_loss": 0.61786532, "learning_rate": 3.7995893058442886e-06, "loss": 0.63814592, "num_input_tokens_seen": 51843675, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.21582031, "step": 2392, "time_per_iteration": 2.861618995666504 }, { "auxiliary_loss_clip": 0.01111459, "auxiliary_loss_mlp": 0.01049624, "balance_loss_clip": 1.02554417, "balance_loss_mlp": 1.02828789, "epoch": 0.14387494363445064, "flos": 14281326241920.0, "grad_norm": 2.069828487465517, "language_loss": 0.76887888, "learning_rate": 3.7994244247031814e-06, "loss": 0.79048973, "num_input_tokens_seen": 51860285, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.83203125, "step": 2393, "time_per_iteration": 2.358253002166748 }, { "auxiliary_loss_clip": 0.01112962, "auxiliary_loss_mlp": 0.01042185, "balance_loss_clip": 1.02052498, "balance_loss_mlp": 1.03079224, "epoch": 0.1439350668871186, "flos": 26760262078080.0, "grad_norm": 1.8514424364669433, "language_loss": 0.76460016, "learning_rate": 3.799259479345246e-06, "loss": 0.78615165, "num_input_tokens_seen": 51880105, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.8203125, "step": 2394, "time_per_iteration": 2.4324076175689697 }, { "auxiliary_loss_clip": 0.01108887, "auxiliary_loss_mlp": 0.01045604, "balance_loss_clip": 1.02238178, "balance_loss_mlp": 1.02888703, "epoch": 0.14399519013978657, "flos": 40698392855040.0, "grad_norm": 1.6178149714271428, "language_loss": 0.86226803, "learning_rate": 3.799094469776367e-06, "loss": 0.8838129, "num_input_tokens_seen": 51905175, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.80078125, "step": 2395, "time_per_iteration": 2.5615274906158447 }, { "auxiliary_loss_clip": 0.01108712, "auxiliary_loss_mlp": 0.010458, "balance_loss_clip": 1.02436662, "balance_loss_mlp": 1.03135109, "epoch": 0.14405531339245453, "flos": 20556510347520.0, "grad_norm": 1.5520876876903358, "language_loss": 0.82945615, "learning_rate": 3.7989293960024353e-06, "loss": 0.85100126, "num_input_tokens_seen": 51924490, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7734375, "step": 2396, "time_per_iteration": 2.386688232421875 }, { "auxiliary_loss_clip": 0.01106862, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.01921928, "balance_loss_mlp": 1.02991748, "epoch": 0.1441154366451225, "flos": 19571924695680.0, "grad_norm": 2.649614566155054, "language_loss": 0.82483745, "learning_rate": 3.79876425802934e-06, "loss": 0.84629709, "num_input_tokens_seen": 51940490, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.76953125, "step": 2397, "time_per_iteration": 2.375566005706787 }, { "auxiliary_loss_clip": 0.0111361, "auxiliary_loss_mlp": 0.01051843, "balance_loss_clip": 1.02984858, "balance_loss_mlp": 1.03093338, "epoch": 0.14417555989779046, "flos": 18514719682560.0, "grad_norm": 1.6649212933750084, "language_loss": 0.79688466, "learning_rate": 3.798599055862976e-06, "loss": 0.8185392, "num_input_tokens_seen": 51957910, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.828125, "step": 2398, "time_per_iteration": 2.3601315021514893 }, { "auxiliary_loss_clip": 0.01106993, "auxiliary_loss_mlp": 0.01051333, "balance_loss_clip": 1.02938616, "balance_loss_mlp": 1.03000951, "epoch": 0.14423568315045843, "flos": 26030472595200.0, "grad_norm": 10.976560248022666, "language_loss": 0.64585006, "learning_rate": 3.798433789509238e-06, "loss": 0.66743332, "num_input_tokens_seen": 51978010, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.76953125, "step": 2399, "time_per_iteration": 2.4182305335998535 }, { "auxiliary_loss_clip": 0.01110507, "auxiliary_loss_mlp": 0.01045423, "balance_loss_clip": 1.02336943, "balance_loss_mlp": 1.03213882, "epoch": 0.14429580640312642, "flos": 21287661373440.0, "grad_norm": 2.09594890333685, "language_loss": 0.82169539, "learning_rate": 3.798268458974024e-06, "loss": 0.84325469, "num_input_tokens_seen": 51998515, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.78515625, "step": 2400, "time_per_iteration": 2.3954548835754395 }, { "auxiliary_loss_clip": 0.01112839, "auxiliary_loss_mlp": 0.01048561, "balance_loss_clip": 1.02368224, "balance_loss_mlp": 1.03122401, "epoch": 0.14435592965579438, "flos": 25626737099520.0, "grad_norm": 2.025461941102729, "language_loss": 0.74472535, "learning_rate": 3.7981030642632348e-06, "loss": 0.76633936, "num_input_tokens_seen": 52019270, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 0.81640625, "step": 2401, "time_per_iteration": 2.4465906620025635 }, { "auxiliary_loss_clip": 0.01109409, "auxiliary_loss_mlp": 0.01038422, "balance_loss_clip": 1.01783419, "balance_loss_mlp": 1.02983093, "epoch": 0.14441605290846235, "flos": 22963981259520.0, "grad_norm": 1.8874126218600147, "language_loss": 0.8074652, "learning_rate": 3.797937605382772e-06, "loss": 0.82894349, "num_input_tokens_seen": 52039315, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.796875, "step": 2402, "time_per_iteration": 2.3857922554016113 }, { "auxiliary_loss_clip": 0.01109774, "auxiliary_loss_mlp": 0.01045489, "balance_loss_clip": 1.02326858, "balance_loss_mlp": 1.03010798, "epoch": 0.14447617616113032, "flos": 17346700414080.0, "grad_norm": 2.4552598884483037, "language_loss": 0.84383583, "learning_rate": 3.7977720823385413e-06, "loss": 0.86538851, "num_input_tokens_seen": 52056555, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.796875, "step": 2403, "time_per_iteration": 2.3651437759399414 }, { "auxiliary_loss_clip": 0.01108275, "auxiliary_loss_mlp": 0.01047624, "balance_loss_clip": 1.02614224, "balance_loss_mlp": 1.02868319, "epoch": 0.14453629941379828, "flos": 24059066964480.0, "grad_norm": 1.9221461120669485, "language_loss": 0.69886154, "learning_rate": 3.797606495136449e-06, "loss": 0.72042048, "num_input_tokens_seen": 52075800, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.796875, "step": 2404, "time_per_iteration": 2.394455909729004 }, { "auxiliary_loss_clip": 0.01105719, "auxiliary_loss_mlp": 0.01043794, "balance_loss_clip": 1.02257442, "balance_loss_mlp": 1.02929091, "epoch": 0.14459642266646625, "flos": 14428659646080.0, "grad_norm": 1.97815403774669, "language_loss": 0.73047113, "learning_rate": 3.7974408437824055e-06, "loss": 0.75196624, "num_input_tokens_seen": 52092585, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.765625, "step": 2405, "time_per_iteration": 2.354947566986084 }, { "auxiliary_loss_clip": 0.01106846, "auxiliary_loss_mlp": 0.01041665, "balance_loss_clip": 1.0204097, "balance_loss_mlp": 1.03113937, "epoch": 0.14465654591913424, "flos": 9866314575360.0, "grad_norm": 4.372737942354544, "language_loss": 0.73072457, "learning_rate": 3.7972751282823216e-06, "loss": 0.7522096, "num_input_tokens_seen": 52108990, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.7578125, "step": 2406, "time_per_iteration": 2.3535666465759277 }, { "auxiliary_loss_clip": 0.01110209, "auxiliary_loss_mlp": 0.01046813, "balance_loss_clip": 1.02440178, "balance_loss_mlp": 1.03055358, "epoch": 0.1447166691718022, "flos": 24971766393600.0, "grad_norm": 2.2657112076268757, "language_loss": 0.75740147, "learning_rate": 3.797109348642111e-06, "loss": 0.77897167, "num_input_tokens_seen": 52125385, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.796875, "step": 2407, "time_per_iteration": 2.403923273086548 }, { "auxiliary_loss_clip": 0.01107285, "auxiliary_loss_mlp": 0.01037676, "balance_loss_clip": 1.01682675, "balance_loss_mlp": 1.02851439, "epoch": 0.14477679242447017, "flos": 21906950803200.0, "grad_norm": 1.4982171274409408, "language_loss": 0.79570168, "learning_rate": 3.796943504867691e-06, "loss": 0.81715131, "num_input_tokens_seen": 52144985, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.78515625, "step": 2408, "time_per_iteration": 2.3764305114746094 }, { "auxiliary_loss_clip": 0.01110048, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.0226922, "balance_loss_mlp": 1.0321908, "epoch": 0.14483691567713813, "flos": 20739699584640.0, "grad_norm": 25.981433853447278, "language_loss": 0.82397455, "learning_rate": 3.7967775969649796e-06, "loss": 0.84553862, "num_input_tokens_seen": 52163885, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.78125, "step": 2409, "time_per_iteration": 2.367091178894043 }, { "auxiliary_loss_clip": 0.01109693, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.02637005, "balance_loss_mlp": 1.03146958, "epoch": 0.1448970389298061, "flos": 35406258301440.0, "grad_norm": 1.7803194277442216, "language_loss": 0.74579203, "learning_rate": 3.7966116249398974e-06, "loss": 0.76736754, "num_input_tokens_seen": 52184325, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.78125, "step": 2410, "time_per_iteration": 2.5116896629333496 }, { "auxiliary_loss_clip": 0.01107985, "auxiliary_loss_mlp": 0.01041762, "balance_loss_clip": 1.02059031, "balance_loss_mlp": 1.02840614, "epoch": 0.14495716218247406, "flos": 15413454766080.0, "grad_norm": 1.8276983278577725, "language_loss": 0.8123709, "learning_rate": 3.7964455887983675e-06, "loss": 0.83386838, "num_input_tokens_seen": 52202740, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.796875, "step": 2411, "time_per_iteration": 2.356724739074707 }, { "auxiliary_loss_clip": 0.01108443, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.0213623, "balance_loss_mlp": 1.03147197, "epoch": 0.14501728543514203, "flos": 33691813344000.0, "grad_norm": 2.289403854800444, "language_loss": 0.70229125, "learning_rate": 3.7962794885463165e-06, "loss": 0.72381175, "num_input_tokens_seen": 52223100, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.76953125, "step": 2412, "time_per_iteration": 2.4828953742980957 }, { "auxiliary_loss_clip": 0.01109675, "auxiliary_loss_mlp": 0.01037952, "balance_loss_clip": 1.0168761, "balance_loss_mlp": 1.03181338, "epoch": 0.14507740868781002, "flos": 15595212637440.0, "grad_norm": 2.979334482442582, "language_loss": 0.76595402, "learning_rate": 3.7961133241896706e-06, "loss": 0.78743023, "num_input_tokens_seen": 52239690, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.78125, "step": 2413, "time_per_iteration": 2.352808713912964 }, { "auxiliary_loss_clip": 0.01028559, "auxiliary_loss_mlp": 0.01003957, "balance_loss_clip": 1.00168025, "balance_loss_mlp": 1.00727987, "epoch": 0.145137531940478, "flos": 66672026023680.0, "grad_norm": 0.8823553427400396, "language_loss": 0.58824027, "learning_rate": 3.79594709573436e-06, "loss": 0.60856533, "num_input_tokens_seen": 52296705, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.21289062, "step": 2414, "time_per_iteration": 2.949816942214966 }, { "auxiliary_loss_clip": 0.01026435, "auxiliary_loss_mlp": 0.01006749, "balance_loss_clip": 1.00451934, "balance_loss_mlp": 1.00521815, "epoch": 0.14519765519314595, "flos": 67518041022720.0, "grad_norm": 0.8364423695325559, "language_loss": 0.62246674, "learning_rate": 3.7957808031863173e-06, "loss": 0.64279854, "num_input_tokens_seen": 52361830, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.21191406, "step": 2415, "time_per_iteration": 3.0504634380340576 }, { "auxiliary_loss_clip": 0.01106679, "auxiliary_loss_mlp": 0.01037006, "balance_loss_clip": 1.01550126, "balance_loss_mlp": 1.02929401, "epoch": 0.14525777844581392, "flos": 17198040378240.0, "grad_norm": 1.999221124672743, "language_loss": 0.71876603, "learning_rate": 3.7956144465514775e-06, "loss": 0.7402029, "num_input_tokens_seen": 52379420, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7734375, "step": 2416, "time_per_iteration": 2.359178304672241 }, { "auxiliary_loss_clip": 0.01025716, "auxiliary_loss_mlp": 0.01003898, "balance_loss_clip": 1.00165713, "balance_loss_mlp": 1.00491476, "epoch": 0.14531790169848188, "flos": 65401152289920.0, "grad_norm": 0.7093771423964166, "language_loss": 0.60392392, "learning_rate": 3.7954480258357765e-06, "loss": 0.62422007, "num_input_tokens_seen": 52446290, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.20800781, "step": 2417, "time_per_iteration": 3.0886611938476562 }, { "auxiliary_loss_clip": 0.01111104, "auxiliary_loss_mlp": 0.01048751, "balance_loss_clip": 1.02648282, "balance_loss_mlp": 1.02903223, "epoch": 0.14537802495114985, "flos": 32561081274240.0, "grad_norm": 2.5029232264615304, "language_loss": 0.78740788, "learning_rate": 3.7952815410451542e-06, "loss": 0.80900639, "num_input_tokens_seen": 52467295, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8203125, "step": 2418, "time_per_iteration": 2.4675533771514893 }, { "auxiliary_loss_clip": 0.01106791, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.01740527, "balance_loss_mlp": 1.02998948, "epoch": 0.1454381482038178, "flos": 20225743326720.0, "grad_norm": 2.3543541080721186, "language_loss": 0.71558362, "learning_rate": 3.7951149921855515e-06, "loss": 0.73703194, "num_input_tokens_seen": 52487295, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.765625, "step": 2419, "time_per_iteration": 2.4007863998413086 }, { "auxiliary_loss_clip": 0.01107661, "auxiliary_loss_mlp": 0.01041494, "balance_loss_clip": 1.01862967, "balance_loss_mlp": 1.03081453, "epoch": 0.1454982714564858, "flos": 22892025214080.0, "grad_norm": 2.509133605470454, "language_loss": 0.89308, "learning_rate": 3.794948379262913e-06, "loss": 0.91457152, "num_input_tokens_seen": 52504220, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.765625, "step": 2420, "time_per_iteration": 2.3839590549468994 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.01040117, "balance_loss_clip": 1.01805139, "balance_loss_mlp": 1.03007388, "epoch": 0.14555839470915377, "flos": 20228815526400.0, "grad_norm": 1.9597872167404442, "language_loss": 0.82786453, "learning_rate": 3.794781702283183e-06, "loss": 0.84934974, "num_input_tokens_seen": 52521900, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.78125, "step": 2421, "time_per_iteration": 2.3867626190185547 }, { "auxiliary_loss_clip": 0.01110605, "auxiliary_loss_mlp": 0.01042945, "balance_loss_clip": 1.0207957, "balance_loss_mlp": 1.03116322, "epoch": 0.14561851796182174, "flos": 22235204206080.0, "grad_norm": 1.6068226050987182, "language_loss": 0.81600267, "learning_rate": 3.7946149612523116e-06, "loss": 0.83753818, "num_input_tokens_seen": 52540495, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.79296875, "step": 2422, "time_per_iteration": 2.388953447341919 }, { "auxiliary_loss_clip": 0.01027698, "auxiliary_loss_mlp": 0.01003126, "balance_loss_clip": 1.00078976, "balance_loss_mlp": 1.00744081, "epoch": 0.1456786412144897, "flos": 52633624222080.0, "grad_norm": 0.9167793141958676, "language_loss": 0.63315821, "learning_rate": 3.794448156176248e-06, "loss": 0.65346646, "num_input_tokens_seen": 52603305, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.203125, "step": 2423, "time_per_iteration": 3.024770975112915 }, { "auxiliary_loss_clip": 0.01109042, "auxiliary_loss_mlp": 0.01039125, "balance_loss_clip": 1.01955104, "balance_loss_mlp": 1.0321672, "epoch": 0.14573876446715767, "flos": 23220557907840.0, "grad_norm": 3.561868481747033, "language_loss": 0.82309031, "learning_rate": 3.794281287060946e-06, "loss": 0.84457195, "num_input_tokens_seen": 52623435, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.76953125, "step": 2424, "time_per_iteration": 3.805325984954834 }, { "auxiliary_loss_clip": 0.01108945, "auxiliary_loss_mlp": 0.01040561, "balance_loss_clip": 1.01880586, "balance_loss_mlp": 1.03246284, "epoch": 0.14579888771982563, "flos": 18113393070720.0, "grad_norm": 2.3867775837286835, "language_loss": 0.78662455, "learning_rate": 3.7941143539123596e-06, "loss": 0.80811965, "num_input_tokens_seen": 52642255, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.765625, "step": 2425, "time_per_iteration": 2.3818471431732178 }, { "auxiliary_loss_clip": 0.011079, "auxiliary_loss_mlp": 0.01040786, "balance_loss_clip": 1.01992452, "balance_loss_mlp": 1.03184569, "epoch": 0.14585901097249362, "flos": 23000046560640.0, "grad_norm": 2.004185816551749, "language_loss": 0.8373462, "learning_rate": 3.7939473567364473e-06, "loss": 0.85883307, "num_input_tokens_seen": 52658700, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.76171875, "step": 2426, "time_per_iteration": 3.758450746536255 }, { "auxiliary_loss_clip": 0.01106852, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.01924932, "balance_loss_mlp": 1.03191352, "epoch": 0.1459191342251616, "flos": 21907579207680.0, "grad_norm": 1.9805063390766124, "language_loss": 0.87306172, "learning_rate": 3.793780295539169e-06, "loss": 0.89452857, "num_input_tokens_seen": 52678140, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.74609375, "step": 2427, "time_per_iteration": 3.782193899154663 }, { "auxiliary_loss_clip": 0.01112837, "auxiliary_loss_mlp": 0.01038514, "balance_loss_clip": 1.01620984, "balance_loss_mlp": 1.03108454, "epoch": 0.14597925747782955, "flos": 14974631487360.0, "grad_norm": 2.3468969024196586, "language_loss": 0.66859877, "learning_rate": 3.793613170326485e-06, "loss": 0.69011229, "num_input_tokens_seen": 52696825, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8203125, "step": 2428, "time_per_iteration": 3.758338689804077 }, { "auxiliary_loss_clip": 0.01106052, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.01875365, "balance_loss_mlp": 1.02963817, "epoch": 0.14603938073049752, "flos": 21067848253440.0, "grad_norm": 2.593160435012226, "language_loss": 0.83403075, "learning_rate": 3.793445981104362e-06, "loss": 0.855497, "num_input_tokens_seen": 52715125, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.765625, "step": 2429, "time_per_iteration": 2.4017221927642822 }, { "auxiliary_loss_clip": 0.0110459, "auxiliary_loss_mlp": 0.01034364, "balance_loss_clip": 1.01366925, "balance_loss_mlp": 1.02881503, "epoch": 0.14609950398316549, "flos": 19863763683840.0, "grad_norm": 1.7183100895627228, "language_loss": 0.79013276, "learning_rate": 3.7932787278787643e-06, "loss": 0.81152231, "num_input_tokens_seen": 52734015, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.7578125, "step": 2430, "time_per_iteration": 2.3640503883361816 }, { "auxiliary_loss_clip": 0.01108524, "auxiliary_loss_mlp": 0.01045156, "balance_loss_clip": 1.02367413, "balance_loss_mlp": 1.0301621, "epoch": 0.14615962723583345, "flos": 22417765038720.0, "grad_norm": 2.056080596525592, "language_loss": 0.82878721, "learning_rate": 3.7931114106556618e-06, "loss": 0.85032403, "num_input_tokens_seen": 52753025, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.78515625, "step": 2431, "time_per_iteration": 2.3871424198150635 }, { "auxiliary_loss_clip": 0.01110717, "auxiliary_loss_mlp": 0.0104409, "balance_loss_clip": 1.02068925, "balance_loss_mlp": 1.03180575, "epoch": 0.14621975048850142, "flos": 22345145677440.0, "grad_norm": 1.8340758882246042, "language_loss": 0.78707421, "learning_rate": 3.7929440294410256e-06, "loss": 0.80862224, "num_input_tokens_seen": 52773420, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 0.7890625, "step": 2432, "time_per_iteration": 2.3856136798858643 }, { "auxiliary_loss_clip": 0.01104668, "auxiliary_loss_mlp": 0.01045863, "balance_loss_clip": 1.02277184, "balance_loss_mlp": 1.02831507, "epoch": 0.1462798737411694, "flos": 24388018594560.0, "grad_norm": 2.069365885011866, "language_loss": 0.79872191, "learning_rate": 3.792776584240829e-06, "loss": 0.82022727, "num_input_tokens_seen": 52792870, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.765625, "step": 2433, "time_per_iteration": 2.4183566570281982 }, { "auxiliary_loss_clip": 0.01104393, "auxiliary_loss_mlp": 0.01039623, "balance_loss_clip": 1.01834464, "balance_loss_mlp": 1.03108001, "epoch": 0.14633999699383737, "flos": 19243671292800.0, "grad_norm": 1.8863344199181091, "language_loss": 0.78056562, "learning_rate": 3.7926090750610477e-06, "loss": 0.80200571, "num_input_tokens_seen": 52811615, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.734375, "step": 2434, "time_per_iteration": 2.362306594848633 }, { "auxiliary_loss_clip": 0.01023121, "auxiliary_loss_mlp": 0.01002245, "balance_loss_clip": 0.9998247, "balance_loss_mlp": 1.00283957, "epoch": 0.14640012024650534, "flos": 62657468184960.0, "grad_norm": 0.856942128852628, "language_loss": 0.58509767, "learning_rate": 3.7924415019076593e-06, "loss": 0.60535133, "num_input_tokens_seen": 52873230, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.203125, "step": 2435, "time_per_iteration": 2.991661787033081 }, { "auxiliary_loss_clip": 0.01104708, "auxiliary_loss_mlp": 0.01040238, "balance_loss_clip": 1.01990104, "balance_loss_mlp": 1.02923751, "epoch": 0.1464602434991733, "flos": 12275426321280.0, "grad_norm": 2.1762567484660567, "language_loss": 0.88035542, "learning_rate": 3.7922738647866447e-06, "loss": 0.90180486, "num_input_tokens_seen": 52889325, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.75390625, "step": 2436, "time_per_iteration": 2.359917640686035 }, { "auxiliary_loss_clip": 0.01109498, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.01778746, "balance_loss_mlp": 1.03151584, "epoch": 0.14652036675184127, "flos": 20921282899200.0, "grad_norm": 2.0799352703185243, "language_loss": 0.74614632, "learning_rate": 3.792106163703986e-06, "loss": 0.7676214, "num_input_tokens_seen": 52909705, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.78125, "step": 2437, "time_per_iteration": 2.401500701904297 }, { "auxiliary_loss_clip": 0.01107899, "auxiliary_loss_mlp": 0.01047454, "balance_loss_clip": 1.02295673, "balance_loss_mlp": 1.02991128, "epoch": 0.14658049000450923, "flos": 27702603118080.0, "grad_norm": 2.836756276761295, "language_loss": 0.73944908, "learning_rate": 3.791938398665668e-06, "loss": 0.76100266, "num_input_tokens_seen": 52930300, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 0.78125, "step": 2438, "time_per_iteration": 2.4427995681762695 }, { "auxiliary_loss_clip": 0.01107115, "auxiliary_loss_mlp": 0.01039244, "balance_loss_clip": 1.01975298, "balance_loss_mlp": 1.03227019, "epoch": 0.14664061325717723, "flos": 24935351978880.0, "grad_norm": 2.120152516942651, "language_loss": 0.74749863, "learning_rate": 3.7917705696776786e-06, "loss": 0.76896226, "num_input_tokens_seen": 52949955, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.74609375, "step": 2439, "time_per_iteration": 2.42877459526062 }, { "auxiliary_loss_clip": 0.01105312, "auxiliary_loss_mlp": 0.01045753, "balance_loss_clip": 1.02427197, "balance_loss_mlp": 1.0307641, "epoch": 0.1467007365098452, "flos": 40296053813760.0, "grad_norm": 1.8759258491755313, "language_loss": 0.74690181, "learning_rate": 3.7916026767460067e-06, "loss": 0.76841247, "num_input_tokens_seen": 52972905, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.74609375, "step": 2440, "time_per_iteration": 2.543079376220703 }, { "auxiliary_loss_clip": 0.01104207, "auxiliary_loss_mlp": 0.01042874, "balance_loss_clip": 1.02344286, "balance_loss_mlp": 1.03002143, "epoch": 0.14676085976251316, "flos": 26539890376320.0, "grad_norm": 1.5325917258297075, "language_loss": 0.83222544, "learning_rate": 3.791434719876643e-06, "loss": 0.85369635, "num_input_tokens_seen": 52994850, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.7421875, "step": 2441, "time_per_iteration": 2.4438955783843994 }, { "auxiliary_loss_clip": 0.01112085, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.01980221, "balance_loss_mlp": 1.03033793, "epoch": 0.14682098301518112, "flos": 23548985867520.0, "grad_norm": 2.030479993575067, "language_loss": 0.72118711, "learning_rate": 3.7912666990755825e-06, "loss": 0.74273717, "num_input_tokens_seen": 53014740, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 0.81640625, "step": 2442, "time_per_iteration": 2.3936851024627686 }, { "auxiliary_loss_clip": 0.0111204, "auxiliary_loss_mlp": 0.01041995, "balance_loss_clip": 1.02048922, "balance_loss_mlp": 1.03172112, "epoch": 0.1468811062678491, "flos": 11650411428480.0, "grad_norm": 2.8190333055980825, "language_loss": 0.81732076, "learning_rate": 3.791098614348821e-06, "loss": 0.83886111, "num_input_tokens_seen": 53029780, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.8046875, "step": 2443, "time_per_iteration": 2.3456430435180664 }, { "auxiliary_loss_clip": 0.01107763, "auxiliary_loss_mlp": 0.01042506, "balance_loss_clip": 1.02182293, "balance_loss_mlp": 1.03075087, "epoch": 0.14694122952051705, "flos": 23001512837760.0, "grad_norm": 1.7544449726777838, "language_loss": 0.82959914, "learning_rate": 3.790930465702358e-06, "loss": 0.85110176, "num_input_tokens_seen": 53048620, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.7734375, "step": 2444, "time_per_iteration": 2.3839950561523438 }, { "auxiliary_loss_clip": 0.01108247, "auxiliary_loss_mlp": 0.01039826, "balance_loss_clip": 1.01883376, "balance_loss_mlp": 1.03084493, "epoch": 0.14700135277318502, "flos": 26501835127680.0, "grad_norm": 1.6459977299007913, "language_loss": 0.70786947, "learning_rate": 3.790762253142193e-06, "loss": 0.72935021, "num_input_tokens_seen": 53070055, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.7734375, "step": 2445, "time_per_iteration": 2.4207019805908203 }, { "auxiliary_loss_clip": 0.01025382, "auxiliary_loss_mlp": 0.01003005, "balance_loss_clip": 1.00078809, "balance_loss_mlp": 1.00529063, "epoch": 0.147061476025853, "flos": 59446366531200.0, "grad_norm": 0.8105754912794436, "language_loss": 0.6305809, "learning_rate": 3.7905939766743296e-06, "loss": 0.65086478, "num_input_tokens_seen": 53126945, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.20117188, "step": 2446, "time_per_iteration": 2.908247232437134 }, { "auxiliary_loss_clip": 0.01108616, "auxiliary_loss_mlp": 0.01047134, "balance_loss_clip": 1.02487803, "balance_loss_mlp": 1.02990222, "epoch": 0.14712159927852098, "flos": 28329607958400.0, "grad_norm": 1.57403669728487, "language_loss": 0.74675715, "learning_rate": 3.790425636304773e-06, "loss": 0.76831472, "num_input_tokens_seen": 53149130, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.7890625, "step": 2447, "time_per_iteration": 2.44604754447937 }, { "auxiliary_loss_clip": 0.01104429, "auxiliary_loss_mlp": 0.01038763, "balance_loss_clip": 1.01849759, "balance_loss_mlp": 1.02910519, "epoch": 0.14718172253118894, "flos": 27088585303680.0, "grad_norm": 2.1219490618768657, "language_loss": 0.85836643, "learning_rate": 3.7902572320395313e-06, "loss": 0.87979835, "num_input_tokens_seen": 53167120, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.75390625, "step": 2448, "time_per_iteration": 2.4199423789978027 }, { "auxiliary_loss_clip": 0.01023192, "auxiliary_loss_mlp": 0.01010136, "balance_loss_clip": 1.00791848, "balance_loss_mlp": 1.00305104, "epoch": 0.1472418457838569, "flos": 66703587759360.0, "grad_norm": 0.7662270412089266, "language_loss": 0.56839919, "learning_rate": 3.790088763884614e-06, "loss": 0.58873248, "num_input_tokens_seen": 53227945, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.20117188, "step": 2449, "time_per_iteration": 2.975759983062744 }, { "auxiliary_loss_clip": 0.01106258, "auxiliary_loss_mlp": 0.01040438, "balance_loss_clip": 1.01940989, "balance_loss_mlp": 1.03197777, "epoch": 0.14730196903652487, "flos": 19572553100160.0, "grad_norm": 1.8227387694827935, "language_loss": 0.85032845, "learning_rate": 3.789920231846033e-06, "loss": 0.87179542, "num_input_tokens_seen": 53244615, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.7421875, "step": 2450, "time_per_iteration": 2.370150566101074 }, { "auxiliary_loss_clip": 0.01109353, "auxiliary_loss_mlp": 0.01039365, "balance_loss_clip": 1.0171082, "balance_loss_mlp": 1.03172064, "epoch": 0.14736209228919284, "flos": 16070101217280.0, "grad_norm": 2.111219050806394, "language_loss": 0.74913561, "learning_rate": 3.7897516359298034e-06, "loss": 0.77062279, "num_input_tokens_seen": 53262205, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.7734375, "step": 2451, "time_per_iteration": 2.3699350357055664 }, { "auxiliary_loss_clip": 0.01102903, "auxiliary_loss_mlp": 0.01041274, "balance_loss_clip": 1.02171206, "balance_loss_mlp": 1.02980554, "epoch": 0.1474222155418608, "flos": 23038346188800.0, "grad_norm": 1.5959023829120984, "language_loss": 0.82293332, "learning_rate": 3.7895829761419417e-06, "loss": 0.84437507, "num_input_tokens_seen": 53282445, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.73046875, "step": 2452, "time_per_iteration": 2.4012458324432373 }, { "auxiliary_loss_clip": 0.01104116, "auxiliary_loss_mlp": 0.01037933, "balance_loss_clip": 1.01787043, "balance_loss_mlp": 1.0311923, "epoch": 0.1474823387945288, "flos": 17017713872640.0, "grad_norm": 1.9016400293559543, "language_loss": 0.74453282, "learning_rate": 3.789414252488467e-06, "loss": 0.76595336, "num_input_tokens_seen": 53299060, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.73046875, "step": 2453, "time_per_iteration": 2.3514039516448975 }, { "auxiliary_loss_clip": 0.011081, "auxiliary_loss_mlp": 0.01037958, "balance_loss_clip": 1.01665509, "balance_loss_mlp": 1.03024554, "epoch": 0.14754246204719676, "flos": 17894068709760.0, "grad_norm": 2.005732480324612, "language_loss": 0.75826025, "learning_rate": 3.7892454649754006e-06, "loss": 0.77972078, "num_input_tokens_seen": 53315970, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.77734375, "step": 2454, "time_per_iteration": 2.351875066757202 }, { "auxiliary_loss_clip": 0.011096, "auxiliary_loss_mlp": 0.01038054, "balance_loss_clip": 1.01739502, "balance_loss_mlp": 1.02999902, "epoch": 0.14760258529986472, "flos": 13078254101760.0, "grad_norm": 1.874940662609717, "language_loss": 0.83087826, "learning_rate": 3.789076613608766e-06, "loss": 0.85235482, "num_input_tokens_seen": 53332940, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.796875, "step": 2455, "time_per_iteration": 2.3562862873077393 }, { "auxiliary_loss_clip": 0.01110501, "auxiliary_loss_mlp": 0.01041664, "balance_loss_clip": 1.02000332, "balance_loss_mlp": 1.02969289, "epoch": 0.1476627085525327, "flos": 30805194666240.0, "grad_norm": 2.15398892222712, "language_loss": 0.83898067, "learning_rate": 3.788907698394589e-06, "loss": 0.8605023, "num_input_tokens_seen": 53353295, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.80859375, "step": 2456, "time_per_iteration": 2.4573004245758057 }, { "auxiliary_loss_clip": 0.0110531, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.012959, "balance_loss_mlp": 1.02993011, "epoch": 0.14772283180520066, "flos": 21688359580800.0, "grad_norm": 1.8900714922399362, "language_loss": 0.84265876, "learning_rate": 3.788738719338898e-06, "loss": 0.86404693, "num_input_tokens_seen": 53373410, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.75390625, "step": 2457, "time_per_iteration": 2.378192901611328 }, { "auxiliary_loss_clip": 0.01104148, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.01481414, "balance_loss_mlp": 1.03054821, "epoch": 0.14778295505786862, "flos": 18769411117440.0, "grad_norm": 1.9567926522310815, "language_loss": 0.74974108, "learning_rate": 3.788569676447723e-06, "loss": 0.77113551, "num_input_tokens_seen": 53391430, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.734375, "step": 2458, "time_per_iteration": 2.3949851989746094 }, { "auxiliary_loss_clip": 0.01111074, "auxiliary_loss_mlp": 0.01041301, "balance_loss_clip": 1.01924753, "balance_loss_mlp": 1.03039289, "epoch": 0.1478430783105366, "flos": 22892444150400.0, "grad_norm": 1.8526540964536895, "language_loss": 0.83802259, "learning_rate": 3.7884005697270976e-06, "loss": 0.85954636, "num_input_tokens_seen": 53409960, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.8046875, "step": 2459, "time_per_iteration": 2.386124849319458 }, { "auxiliary_loss_clip": 0.01102934, "auxiliary_loss_mlp": 0.01039875, "balance_loss_clip": 1.02070618, "balance_loss_mlp": 1.03005552, "epoch": 0.14790320156320458, "flos": 15084433313280.0, "grad_norm": 2.3751272671493258, "language_loss": 0.75208676, "learning_rate": 3.7882313991830553e-06, "loss": 0.77351487, "num_input_tokens_seen": 53426160, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.7265625, "step": 2460, "time_per_iteration": 2.3616836071014404 }, { "auxiliary_loss_clip": 0.01109286, "auxiliary_loss_mlp": 0.01040809, "balance_loss_clip": 1.01885045, "balance_loss_mlp": 1.03090692, "epoch": 0.14796332481587254, "flos": 26503580695680.0, "grad_norm": 1.695323430815126, "language_loss": 0.81721282, "learning_rate": 3.788062164821635e-06, "loss": 0.83871377, "num_input_tokens_seen": 53448530, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.78515625, "step": 2461, "time_per_iteration": 2.438246250152588 }, { "auxiliary_loss_clip": 0.01108934, "auxiliary_loss_mlp": 0.01041782, "balance_loss_clip": 1.01964498, "balance_loss_mlp": 1.03071451, "epoch": 0.1480234480685405, "flos": 17562324170880.0, "grad_norm": 3.2664391766448513, "language_loss": 0.65738714, "learning_rate": 3.7878928666488755e-06, "loss": 0.67889428, "num_input_tokens_seen": 53465915, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.78125, "step": 2462, "time_per_iteration": 2.344008207321167 }, { "auxiliary_loss_clip": 0.01106101, "auxiliary_loss_mlp": 0.01047585, "balance_loss_clip": 1.02500653, "balance_loss_mlp": 1.02892792, "epoch": 0.14808357132120847, "flos": 53580121580160.0, "grad_norm": 2.134080966410557, "language_loss": 0.67159075, "learning_rate": 3.787723504670818e-06, "loss": 0.69312757, "num_input_tokens_seen": 53496055, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.7734375, "step": 2463, "time_per_iteration": 2.6848130226135254 }, { "auxiliary_loss_clip": 0.01104801, "auxiliary_loss_mlp": 0.01044845, "balance_loss_clip": 1.02301788, "balance_loss_mlp": 1.02841341, "epoch": 0.14814369457387644, "flos": 19828152230400.0, "grad_norm": 1.6024968555364416, "language_loss": 0.76646018, "learning_rate": 3.7875540788935076e-06, "loss": 0.78795666, "num_input_tokens_seen": 53513790, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.765625, "step": 2464, "time_per_iteration": 3.7671191692352295 }, { "auxiliary_loss_clip": 0.01104148, "auxiliary_loss_mlp": 0.01039635, "balance_loss_clip": 1.01981044, "balance_loss_mlp": 1.03094208, "epoch": 0.1482038178265444, "flos": 23913828241920.0, "grad_norm": 1.6632214555847842, "language_loss": 0.79578698, "learning_rate": 3.7873845893229896e-06, "loss": 0.81722486, "num_input_tokens_seen": 53533410, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.73046875, "step": 2465, "time_per_iteration": 2.411268711090088 }, { "auxiliary_loss_clip": 0.01109104, "auxiliary_loss_mlp": 0.01042305, "balance_loss_clip": 1.02058506, "balance_loss_mlp": 1.0299412, "epoch": 0.1482639410792124, "flos": 24169357549440.0, "grad_norm": 2.5067273738761964, "language_loss": 0.76638633, "learning_rate": 3.7872150359653143e-06, "loss": 0.78790045, "num_input_tokens_seen": 53554775, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.79296875, "step": 2466, "time_per_iteration": 3.780672788619995 }, { "auxiliary_loss_clip": 0.01025631, "auxiliary_loss_mlp": 0.01002746, "balance_loss_clip": 1.00039744, "balance_loss_mlp": 1.00557256, "epoch": 0.14832406433188036, "flos": 66188025578880.0, "grad_norm": 0.7811321793737293, "language_loss": 0.6012544, "learning_rate": 3.787045418826531e-06, "loss": 0.62153816, "num_input_tokens_seen": 53609675, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.20117188, "step": 2467, "time_per_iteration": 4.413452863693237 }, { "auxiliary_loss_clip": 0.01102364, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.01523924, "balance_loss_mlp": 1.02893591, "epoch": 0.14838418758454833, "flos": 25410066001920.0, "grad_norm": 2.1207902730080836, "language_loss": 0.87758505, "learning_rate": 3.7868757379126938e-06, "loss": 0.89896071, "num_input_tokens_seen": 53626950, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.734375, "step": 2468, "time_per_iteration": 3.744333505630493 }, { "auxiliary_loss_clip": 0.01106776, "auxiliary_loss_mlp": 0.01040982, "balance_loss_clip": 1.01863039, "balance_loss_mlp": 1.02884674, "epoch": 0.1484443108372163, "flos": 23288918083200.0, "grad_norm": 2.031910697410879, "language_loss": 0.76093054, "learning_rate": 3.7867059932298578e-06, "loss": 0.78240806, "num_input_tokens_seen": 53644200, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.78125, "step": 2469, "time_per_iteration": 2.4004931449890137 }, { "auxiliary_loss_clip": 0.01105523, "auxiliary_loss_mlp": 0.01045265, "balance_loss_clip": 1.02372372, "balance_loss_mlp": 1.0301652, "epoch": 0.14850443408988426, "flos": 14646797020800.0, "grad_norm": 2.357346122684486, "language_loss": 0.75776291, "learning_rate": 3.786536184784081e-06, "loss": 0.77927077, "num_input_tokens_seen": 53659650, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.75390625, "step": 2470, "time_per_iteration": 2.352248430252075 }, { "auxiliary_loss_clip": 0.01102071, "auxiliary_loss_mlp": 0.01042208, "balance_loss_clip": 1.02152479, "balance_loss_mlp": 1.02874684, "epoch": 0.14856455734255222, "flos": 23547240299520.0, "grad_norm": 1.958258548759413, "language_loss": 0.72282279, "learning_rate": 3.786366312581423e-06, "loss": 0.74426562, "num_input_tokens_seen": 53680275, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.734375, "step": 2471, "time_per_iteration": 2.4054150581359863 }, { "auxiliary_loss_clip": 0.01108779, "auxiliary_loss_mlp": 0.01039437, "balance_loss_clip": 1.01688242, "balance_loss_mlp": 1.02943885, "epoch": 0.1486246805952202, "flos": 18076315340160.0, "grad_norm": 2.567091590849504, "language_loss": 0.89306957, "learning_rate": 3.786196376627947e-06, "loss": 0.91455173, "num_input_tokens_seen": 53698270, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.796875, "step": 2472, "time_per_iteration": 2.360830783843994 }, { "auxiliary_loss_clip": 0.01105657, "auxiliary_loss_mlp": 0.01043823, "balance_loss_clip": 1.02275848, "balance_loss_mlp": 1.02937889, "epoch": 0.14868480384788818, "flos": 19352635246080.0, "grad_norm": 3.2100663064713113, "language_loss": 0.80462313, "learning_rate": 3.7860263769297163e-06, "loss": 0.82611787, "num_input_tokens_seen": 53716845, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.76171875, "step": 2473, "time_per_iteration": 2.360668420791626 }, { "auxiliary_loss_clip": 0.01109533, "auxiliary_loss_mlp": 0.01041752, "balance_loss_clip": 1.02066338, "balance_loss_mlp": 1.02998114, "epoch": 0.14874492710055615, "flos": 22199103993600.0, "grad_norm": 2.565127652122542, "language_loss": 0.77484328, "learning_rate": 3.7858563134927985e-06, "loss": 0.79635614, "num_input_tokens_seen": 53734970, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.796875, "step": 2474, "time_per_iteration": 2.384188413619995 }, { "auxiliary_loss_clip": 0.01106567, "auxiliary_loss_mlp": 0.01043922, "balance_loss_clip": 1.02127266, "balance_loss_mlp": 1.02885747, "epoch": 0.1488050503532241, "flos": 21102447277440.0, "grad_norm": 3.3138585561777703, "language_loss": 0.82387251, "learning_rate": 3.785686186323263e-06, "loss": 0.84537739, "num_input_tokens_seen": 53753415, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.77734375, "step": 2475, "time_per_iteration": 2.36897611618042 }, { "auxiliary_loss_clip": 0.01108567, "auxiliary_loss_mlp": 0.01047392, "balance_loss_clip": 1.0271976, "balance_loss_mlp": 1.03358066, "epoch": 0.14886517360589208, "flos": 12785751797760.0, "grad_norm": 1.8469100074926545, "language_loss": 0.80300593, "learning_rate": 3.785515995427181e-06, "loss": 0.82456547, "num_input_tokens_seen": 53770305, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.75, "step": 2476, "time_per_iteration": 2.3844544887542725 }, { "auxiliary_loss_clip": 0.01100131, "auxiliary_loss_mlp": 0.01044251, "balance_loss_clip": 1.02424812, "balance_loss_mlp": 1.02915752, "epoch": 0.14892529685856004, "flos": 29021586572160.0, "grad_norm": 1.650390794059293, "language_loss": 0.77724421, "learning_rate": 3.7853457408106257e-06, "loss": 0.79868805, "num_input_tokens_seen": 53788895, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.7109375, "step": 2477, "time_per_iteration": 2.4232230186462402 }, { "auxiliary_loss_clip": 0.01023444, "auxiliary_loss_mlp": 0.01002709, "balance_loss_clip": 1.0001694, "balance_loss_mlp": 1.00380075, "epoch": 0.148985420111228, "flos": 61923105313920.0, "grad_norm": 0.8200409354329088, "language_loss": 0.60067445, "learning_rate": 3.785175422479673e-06, "loss": 0.62093598, "num_input_tokens_seen": 53850260, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.19628906, "step": 2478, "time_per_iteration": 3.0559191703796387 }, { "auxiliary_loss_clip": 0.01106866, "auxiliary_loss_mlp": 0.01040803, "balance_loss_clip": 1.02027547, "balance_loss_mlp": 1.03009081, "epoch": 0.149045543363896, "flos": 23913967887360.0, "grad_norm": 3.065683946321891, "language_loss": 0.70812583, "learning_rate": 3.785005040440402e-06, "loss": 0.72960258, "num_input_tokens_seen": 53867520, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.765625, "step": 2479, "time_per_iteration": 2.4103593826293945 }, { "auxiliary_loss_clip": 0.01105955, "auxiliary_loss_mlp": 0.01036613, "balance_loss_clip": 1.01666939, "balance_loss_mlp": 1.03049088, "epoch": 0.14910566661656396, "flos": 23653411344000.0, "grad_norm": 1.8489014792152119, "language_loss": 0.81115156, "learning_rate": 3.784834594698892e-06, "loss": 0.83257723, "num_input_tokens_seen": 53886620, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.7578125, "step": 2480, "time_per_iteration": 2.4197895526885986 }, { "auxiliary_loss_clip": 0.01108557, "auxiliary_loss_mlp": 0.01041216, "balance_loss_clip": 1.02054477, "balance_loss_mlp": 1.03099465, "epoch": 0.14916578986923193, "flos": 20514440292480.0, "grad_norm": 2.551710077764799, "language_loss": 0.84128141, "learning_rate": 3.7846640852612275e-06, "loss": 0.86277914, "num_input_tokens_seen": 53902230, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.77734375, "step": 2481, "time_per_iteration": 2.365886926651001 }, { "auxiliary_loss_clip": 0.01106418, "auxiliary_loss_mlp": 0.01048773, "balance_loss_clip": 1.02586138, "balance_loss_mlp": 1.02866387, "epoch": 0.1492259131218999, "flos": 22490733513600.0, "grad_norm": 2.197025405151839, "language_loss": 0.7761029, "learning_rate": 3.7844935121334917e-06, "loss": 0.79765475, "num_input_tokens_seen": 53919475, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.77734375, "step": 2482, "time_per_iteration": 2.3777050971984863 }, { "auxiliary_loss_clip": 0.01113147, "auxiliary_loss_mlp": 0.01039511, "balance_loss_clip": 1.01711154, "balance_loss_mlp": 1.03127205, "epoch": 0.14928603637456786, "flos": 23184736986240.0, "grad_norm": 2.387801286820446, "language_loss": 0.78827822, "learning_rate": 3.7843228753217726e-06, "loss": 0.8098048, "num_input_tokens_seen": 53939150, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.8203125, "step": 2483, "time_per_iteration": 2.389840841293335 }, { "auxiliary_loss_clip": 0.01102808, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.01655495, "balance_loss_mlp": 1.02996683, "epoch": 0.14934615962723582, "flos": 21652154634240.0, "grad_norm": 1.7036932947225292, "language_loss": 0.70134556, "learning_rate": 3.784152174832161e-06, "loss": 0.72272271, "num_input_tokens_seen": 53958735, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.7265625, "step": 2484, "time_per_iteration": 2.3964152336120605 }, { "auxiliary_loss_clip": 0.0110853, "auxiliary_loss_mlp": 0.01039956, "balance_loss_clip": 1.01690102, "balance_loss_mlp": 1.02992082, "epoch": 0.1494062828799038, "flos": 27009018581760.0, "grad_norm": 1.9461525369762271, "language_loss": 0.84372914, "learning_rate": 3.783981410670747e-06, "loss": 0.86521399, "num_input_tokens_seen": 53975065, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.78515625, "step": 2485, "time_per_iteration": 2.395015239715576 }, { "auxiliary_loss_clip": 0.01109191, "auxiliary_loss_mlp": 0.010414, "balance_loss_clip": 1.01866639, "balance_loss_mlp": 1.03184628, "epoch": 0.14946640613257178, "flos": 21213889937280.0, "grad_norm": 2.3685290325684805, "language_loss": 0.85049736, "learning_rate": 3.7838105828436246e-06, "loss": 0.87200332, "num_input_tokens_seen": 53993330, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.7734375, "step": 2486, "time_per_iteration": 2.380772590637207 }, { "auxiliary_loss_clip": 0.01102168, "auxiliary_loss_mlp": 0.01035574, "balance_loss_clip": 1.01717949, "balance_loss_mlp": 1.0275681, "epoch": 0.14952652938523975, "flos": 13370023267200.0, "grad_norm": 2.4486603015435375, "language_loss": 0.74685532, "learning_rate": 3.7836396913568924e-06, "loss": 0.7682327, "num_input_tokens_seen": 54010515, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.74609375, "step": 2487, "time_per_iteration": 2.348187208175659 }, { "auxiliary_loss_clip": 0.01105893, "auxiliary_loss_mlp": 0.01045336, "balance_loss_clip": 1.0232228, "balance_loss_mlp": 1.0319078, "epoch": 0.1495866526379077, "flos": 35516234684160.0, "grad_norm": 1.970596711140622, "language_loss": 0.71655691, "learning_rate": 3.783468736216647e-06, "loss": 0.73806924, "num_input_tokens_seen": 54031315, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.7421875, "step": 2488, "time_per_iteration": 2.507781982421875 }, { "auxiliary_loss_clip": 0.01108562, "auxiliary_loss_mlp": 0.01045898, "balance_loss_clip": 1.02378464, "balance_loss_mlp": 1.02948689, "epoch": 0.14964677589057568, "flos": 17631976066560.0, "grad_norm": 3.4016547959130485, "language_loss": 0.70374882, "learning_rate": 3.78329771742899e-06, "loss": 0.7252934, "num_input_tokens_seen": 54045965, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.79296875, "step": 2489, "time_per_iteration": 2.3227648735046387 }, { "auxiliary_loss_clip": 0.01107973, "auxiliary_loss_mlp": 0.01046761, "balance_loss_clip": 1.02527964, "balance_loss_mlp": 1.02966833, "epoch": 0.14970689914324364, "flos": 20184476232960.0, "grad_norm": 3.089408897003641, "language_loss": 0.82155168, "learning_rate": 3.7831266350000246e-06, "loss": 0.843099, "num_input_tokens_seen": 54059960, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.78515625, "step": 2490, "time_per_iteration": 2.3514554500579834 }, { "auxiliary_loss_clip": 0.01109634, "auxiliary_loss_mlp": 0.01042063, "balance_loss_clip": 1.02196455, "balance_loss_mlp": 1.03133702, "epoch": 0.1497670223959116, "flos": 37227293239680.0, "grad_norm": 1.8708118573962265, "language_loss": 0.79381263, "learning_rate": 3.7829554889358566e-06, "loss": 0.81532961, "num_input_tokens_seen": 54079330, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.78515625, "step": 2491, "time_per_iteration": 2.5021839141845703 }, { "auxiliary_loss_clip": 0.01106918, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.01766706, "balance_loss_mlp": 1.02717793, "epoch": 0.1498271456485796, "flos": 24454877581440.0, "grad_norm": 1.7980417862337945, "language_loss": 0.9063071, "learning_rate": 3.782784279242593e-06, "loss": 0.92778003, "num_input_tokens_seen": 54097555, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.796875, "step": 2492, "time_per_iteration": 2.4079651832580566 }, { "auxiliary_loss_clip": 0.01105784, "auxiliary_loss_mlp": 0.01048743, "balance_loss_clip": 1.02727377, "balance_loss_mlp": 1.02955317, "epoch": 0.14988726890124757, "flos": 16252662049920.0, "grad_norm": 4.15489378567907, "language_loss": 0.78564751, "learning_rate": 3.782613005926345e-06, "loss": 0.80719274, "num_input_tokens_seen": 54115600, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.76171875, "step": 2493, "time_per_iteration": 2.36659836769104 }, { "auxiliary_loss_clip": 0.01106706, "auxiliary_loss_mlp": 0.01043433, "balance_loss_clip": 1.02074707, "balance_loss_mlp": 1.02827418, "epoch": 0.14994739215391553, "flos": 20665544123520.0, "grad_norm": 2.2290031291976846, "language_loss": 0.80210996, "learning_rate": 3.7824416689932236e-06, "loss": 0.82361138, "num_input_tokens_seen": 54135220, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.78515625, "step": 2494, "time_per_iteration": 2.386484146118164 }, { "auxiliary_loss_clip": 0.01104888, "auxiliary_loss_mlp": 0.01045692, "balance_loss_clip": 1.02410316, "balance_loss_mlp": 1.02876639, "epoch": 0.1500075154065835, "flos": 70649961845760.0, "grad_norm": 16.878931362562213, "language_loss": 0.6622541, "learning_rate": 3.782270268449345e-06, "loss": 0.68375999, "num_input_tokens_seen": 54161065, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.76171875, "step": 2495, "time_per_iteration": 2.770857572555542 }, { "auxiliary_loss_clip": 0.01022901, "auxiliary_loss_mlp": 0.01007046, "balance_loss_clip": 1.00465024, "balance_loss_mlp": 1.00302672, "epoch": 0.15006763865925146, "flos": 68008955783040.0, "grad_norm": 0.8867353178954155, "language_loss": 0.59524918, "learning_rate": 3.7820988043008242e-06, "loss": 0.61554861, "num_input_tokens_seen": 54225095, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.19921875, "step": 2496, "time_per_iteration": 3.040221691131592 }, { "auxiliary_loss_clip": 0.01107105, "auxiliary_loss_mlp": 0.0104946, "balance_loss_clip": 1.02659595, "balance_loss_mlp": 1.02788699, "epoch": 0.15012776191191943, "flos": 18915278244480.0, "grad_norm": 1.9285751832320503, "language_loss": 0.65387583, "learning_rate": 3.7819272765537817e-06, "loss": 0.6754415, "num_input_tokens_seen": 54243750, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.7890625, "step": 2497, "time_per_iteration": 2.3551132678985596 }, { "auxiliary_loss_clip": 0.01109416, "auxiliary_loss_mlp": 0.01040362, "balance_loss_clip": 1.01925039, "balance_loss_mlp": 1.03077793, "epoch": 0.1501878851645874, "flos": 23700054787200.0, "grad_norm": 1.4946450898699988, "language_loss": 0.75399512, "learning_rate": 3.781755685214338e-06, "loss": 0.77549291, "num_input_tokens_seen": 54266185, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.78515625, "step": 2498, "time_per_iteration": 2.451472759246826 }, { "auxiliary_loss_clip": 0.01110941, "auxiliary_loss_mlp": 0.01048753, "balance_loss_clip": 1.02396941, "balance_loss_mlp": 1.03037, "epoch": 0.15024800841725539, "flos": 20411481093120.0, "grad_norm": 3.5897244410704174, "language_loss": 0.72171801, "learning_rate": 3.7815840302886174e-06, "loss": 0.74331498, "num_input_tokens_seen": 54283940, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 0.8046875, "step": 2499, "time_per_iteration": 2.3646938800811768 }, { "auxiliary_loss_clip": 0.01106481, "auxiliary_loss_mlp": 0.01044837, "balance_loss_clip": 1.02206814, "balance_loss_mlp": 1.02982831, "epoch": 0.15030813166992335, "flos": 31829685868800.0, "grad_norm": 2.078369693629021, "language_loss": 0.71735597, "learning_rate": 3.7814123117827446e-06, "loss": 0.73886919, "num_input_tokens_seen": 54304830, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.76953125, "step": 2500, "time_per_iteration": 2.465881586074829 }, { "auxiliary_loss_clip": 0.0110495, "auxiliary_loss_mlp": 0.01051187, "balance_loss_clip": 1.0273335, "balance_loss_mlp": 1.02872014, "epoch": 0.15036825492259132, "flos": 35656515993600.0, "grad_norm": 1.7714213938379884, "language_loss": 0.64922321, "learning_rate": 3.7812405297028496e-06, "loss": 0.67078459, "num_input_tokens_seen": 54325595, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.76171875, "step": 2501, "time_per_iteration": 2.4886040687561035 }, { "auxiliary_loss_clip": 0.01102843, "auxiliary_loss_mlp": 0.01047496, "balance_loss_clip": 1.02550197, "balance_loss_mlp": 1.02860188, "epoch": 0.15042837817525928, "flos": 18837317445120.0, "grad_norm": 3.2510901341753407, "language_loss": 0.83437526, "learning_rate": 3.7810686840550627e-06, "loss": 0.85587859, "num_input_tokens_seen": 54342180, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.7421875, "step": 2502, "time_per_iteration": 2.3517165184020996 }, { "auxiliary_loss_clip": 0.01101775, "auxiliary_loss_mlp": 0.01043332, "balance_loss_clip": 1.02318525, "balance_loss_mlp": 1.02661479, "epoch": 0.15048850142792725, "flos": 19534567674240.0, "grad_norm": 1.8809738964560827, "language_loss": 0.77416849, "learning_rate": 3.780896774845515e-06, "loss": 0.79561961, "num_input_tokens_seen": 54360255, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.75, "step": 2503, "time_per_iteration": 3.7314107418060303 }, { "auxiliary_loss_clip": 0.01104913, "auxiliary_loss_mlp": 0.01038449, "balance_loss_clip": 1.01751566, "balance_loss_mlp": 1.02834249, "epoch": 0.1505486246805952, "flos": 22016473338240.0, "grad_norm": 1.886192327546137, "language_loss": 0.8547039, "learning_rate": 3.780724802080342e-06, "loss": 0.8761375, "num_input_tokens_seen": 54378260, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.765625, "step": 2504, "time_per_iteration": 2.382795810699463 }, { "auxiliary_loss_clip": 0.01104583, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.01629364, "balance_loss_mlp": 1.02975178, "epoch": 0.15060874793326318, "flos": 20742038645760.0, "grad_norm": 1.6319880378542269, "language_loss": 0.83300298, "learning_rate": 3.780552765765682e-06, "loss": 0.85441315, "num_input_tokens_seen": 54399745, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.75, "step": 2505, "time_per_iteration": 2.389429807662964 }, { "auxiliary_loss_clip": 0.01102799, "auxiliary_loss_mlp": 0.01041655, "balance_loss_clip": 1.02066219, "balance_loss_mlp": 1.02746129, "epoch": 0.15066887118593117, "flos": 16470973981440.0, "grad_norm": 2.5789060434706066, "language_loss": 0.75912398, "learning_rate": 3.7803806659076736e-06, "loss": 0.78056848, "num_input_tokens_seen": 54417105, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.75390625, "step": 2506, "time_per_iteration": 3.8077516555786133 }, { "auxiliary_loss_clip": 0.0110723, "auxiliary_loss_mlp": 0.01043255, "balance_loss_clip": 1.02203536, "balance_loss_mlp": 1.02894378, "epoch": 0.15072899443859913, "flos": 19858457157120.0, "grad_norm": 3.620360501858079, "language_loss": 0.76408052, "learning_rate": 3.7802085025124596e-06, "loss": 0.78558534, "num_input_tokens_seen": 54433920, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.78125, "step": 2507, "time_per_iteration": 3.6984262466430664 }, { "auxiliary_loss_clip": 0.01101382, "auxiliary_loss_mlp": 0.01037609, "balance_loss_clip": 1.01672363, "balance_loss_mlp": 1.02788413, "epoch": 0.1507891176912671, "flos": 20775206304000.0, "grad_norm": 1.8528607674820419, "language_loss": 0.68635213, "learning_rate": 3.780036275586183e-06, "loss": 0.70774198, "num_input_tokens_seen": 54451540, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.734375, "step": 2508, "time_per_iteration": 2.3735949993133545 }, { "auxiliary_loss_clip": 0.0110717, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.01870346, "balance_loss_mlp": 1.02987397, "epoch": 0.15084924094393506, "flos": 23585505016320.0, "grad_norm": 1.703027506053113, "language_loss": 0.77397841, "learning_rate": 3.77986398513499e-06, "loss": 0.79544818, "num_input_tokens_seen": 54470800, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.7734375, "step": 2509, "time_per_iteration": 2.409031867980957 }, { "auxiliary_loss_clip": 0.01111394, "auxiliary_loss_mlp": 0.01046605, "balance_loss_clip": 1.02228665, "balance_loss_mlp": 1.02996719, "epoch": 0.15090936419660303, "flos": 18910460476800.0, "grad_norm": 2.139432909344303, "language_loss": 0.79934525, "learning_rate": 3.7796916311650306e-06, "loss": 0.82092535, "num_input_tokens_seen": 54486525, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.8125, "step": 2510, "time_per_iteration": 2.332805871963501 }, { "auxiliary_loss_clip": 0.0110901, "auxiliary_loss_mlp": 0.01047356, "balance_loss_clip": 1.0243609, "balance_loss_mlp": 1.03031814, "epoch": 0.150969487449271, "flos": 17927341102080.0, "grad_norm": 2.0902943664394393, "language_loss": 0.73790503, "learning_rate": 3.779519213682454e-06, "loss": 0.75946862, "num_input_tokens_seen": 54503795, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.7890625, "step": 2511, "time_per_iteration": 2.363016128540039 }, { "auxiliary_loss_clip": 0.01105998, "auxiliary_loss_mlp": 0.01040421, "balance_loss_clip": 1.02032197, "balance_loss_mlp": 1.03014183, "epoch": 0.151029610701939, "flos": 24241941999360.0, "grad_norm": 1.9770655402867399, "language_loss": 0.69080341, "learning_rate": 3.7793467326934147e-06, "loss": 0.71226764, "num_input_tokens_seen": 54523025, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.7578125, "step": 2512, "time_per_iteration": 2.397308349609375 }, { "auxiliary_loss_clip": 0.0110867, "auxiliary_loss_mlp": 0.01042614, "balance_loss_clip": 1.02091765, "balance_loss_mlp": 1.0323503, "epoch": 0.15108973395460695, "flos": 30261212772480.0, "grad_norm": 2.6598247798626575, "language_loss": 0.73773617, "learning_rate": 3.7791741882040677e-06, "loss": 0.75924897, "num_input_tokens_seen": 54545025, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.765625, "step": 2513, "time_per_iteration": 2.45658802986145 }, { "auxiliary_loss_clip": 0.01020814, "auxiliary_loss_mlp": 0.010182, "balance_loss_clip": 1.01548231, "balance_loss_mlp": 1.00186658, "epoch": 0.15114985720727492, "flos": 60434443319040.0, "grad_norm": 0.8834424408720046, "language_loss": 0.64798552, "learning_rate": 3.7790015802205703e-06, "loss": 0.66837567, "num_input_tokens_seen": 54604545, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.18945312, "step": 2514, "time_per_iteration": 2.968867778778076 }, { "auxiliary_loss_clip": 0.01103398, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.01708698, "balance_loss_mlp": 1.02875233, "epoch": 0.15120998045994288, "flos": 20520654514560.0, "grad_norm": 5.903596646191268, "language_loss": 0.73183751, "learning_rate": 3.778828908749082e-06, "loss": 0.75326133, "num_input_tokens_seen": 54620590, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.74609375, "step": 2515, "time_per_iteration": 2.3934273719787598 }, { "auxiliary_loss_clip": 0.0110383, "auxiliary_loss_mlp": 0.01039183, "balance_loss_clip": 1.0183332, "balance_loss_mlp": 1.02974045, "epoch": 0.15127010371261085, "flos": 21177824636160.0, "grad_norm": 1.8364367976316958, "language_loss": 0.7763139, "learning_rate": 3.7786561737957664e-06, "loss": 0.79774404, "num_input_tokens_seen": 54640410, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7421875, "step": 2516, "time_per_iteration": 2.3890445232391357 }, { "auxiliary_loss_clip": 0.01022081, "auxiliary_loss_mlp": 0.01006885, "balance_loss_clip": 1.00452423, "balance_loss_mlp": 1.00267529, "epoch": 0.1513302269652788, "flos": 65317500938880.0, "grad_norm": 0.7261458198856392, "language_loss": 0.54662186, "learning_rate": 3.7784833753667867e-06, "loss": 0.56691152, "num_input_tokens_seen": 54701430, "router_z_loss_clip": 0.02355957, "router_z_loss_mlp": 0.19335938, "step": 2517, "time_per_iteration": 3.0439090728759766 }, { "auxiliary_loss_clip": 0.01105508, "auxiliary_loss_mlp": 0.0104157, "balance_loss_clip": 1.01917112, "balance_loss_mlp": 1.02838504, "epoch": 0.15139035021794678, "flos": 19134812073600.0, "grad_norm": 2.059296618904718, "language_loss": 0.78328919, "learning_rate": 3.7783105134683108e-06, "loss": 0.80475998, "num_input_tokens_seen": 54720845, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.7734375, "step": 2518, "time_per_iteration": 2.396969795227051 }, { "auxiliary_loss_clip": 0.01109758, "auxiliary_loss_mlp": 0.01046193, "balance_loss_clip": 1.02384138, "balance_loss_mlp": 1.03013384, "epoch": 0.15145047347061477, "flos": 26577352131840.0, "grad_norm": 2.0281620108176632, "language_loss": 0.69986463, "learning_rate": 3.7781375881065066e-06, "loss": 0.7214241, "num_input_tokens_seen": 54740495, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.796875, "step": 2519, "time_per_iteration": 2.4198665618896484 }, { "auxiliary_loss_clip": 0.01106441, "auxiliary_loss_mlp": 0.01041322, "balance_loss_clip": 1.02094889, "balance_loss_mlp": 1.02969408, "epoch": 0.15151059672328274, "flos": 20301923646720.0, "grad_norm": 2.499711858783702, "language_loss": 0.78746629, "learning_rate": 3.7779645992875453e-06, "loss": 0.80894399, "num_input_tokens_seen": 54758415, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.76953125, "step": 2520, "time_per_iteration": 2.375433921813965 }, { "auxiliary_loss_clip": 0.0111098, "auxiliary_loss_mlp": 0.01047152, "balance_loss_clip": 1.02419233, "balance_loss_mlp": 1.03058589, "epoch": 0.1515707199759507, "flos": 27227330513280.0, "grad_norm": 1.7823679850582101, "language_loss": 0.74635911, "learning_rate": 3.7777915470176013e-06, "loss": 0.7679404, "num_input_tokens_seen": 54779355, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.8046875, "step": 2521, "time_per_iteration": 2.4408998489379883 }, { "auxiliary_loss_clip": 0.01110329, "auxiliary_loss_mlp": 0.01043944, "balance_loss_clip": 1.0210675, "balance_loss_mlp": 1.03026247, "epoch": 0.15163084322861867, "flos": 23586203243520.0, "grad_norm": 1.9815758785267137, "language_loss": 0.81626248, "learning_rate": 3.7776184313028504e-06, "loss": 0.83780521, "num_input_tokens_seen": 54799465, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.80078125, "step": 2522, "time_per_iteration": 2.3857479095458984 }, { "auxiliary_loss_clip": 0.01027446, "auxiliary_loss_mlp": 0.01011436, "balance_loss_clip": 1.00920677, "balance_loss_mlp": 1.00768375, "epoch": 0.15169096648128663, "flos": 66886427882880.0, "grad_norm": 0.8246270249677684, "language_loss": 0.57857478, "learning_rate": 3.7774452521494703e-06, "loss": 0.59896362, "num_input_tokens_seen": 54857665, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.19726562, "step": 2523, "time_per_iteration": 2.964157819747925 }, { "auxiliary_loss_clip": 0.01105539, "auxiliary_loss_mlp": 0.01043856, "balance_loss_clip": 1.02152777, "balance_loss_mlp": 1.02922678, "epoch": 0.1517510897339546, "flos": 29094171022080.0, "grad_norm": 1.7355717654770246, "language_loss": 0.74829638, "learning_rate": 3.777272009563641e-06, "loss": 0.76979029, "num_input_tokens_seen": 54879895, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.765625, "step": 2524, "time_per_iteration": 2.434436559677124 }, { "auxiliary_loss_clip": 0.01103708, "auxiliary_loss_mlp": 0.01045557, "balance_loss_clip": 1.02325308, "balance_loss_mlp": 1.0275718, "epoch": 0.1518112129866226, "flos": 18405546261120.0, "grad_norm": 2.2503896736230757, "language_loss": 0.74488342, "learning_rate": 3.7770987035515454e-06, "loss": 0.76637608, "num_input_tokens_seen": 54898245, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.76171875, "step": 2525, "time_per_iteration": 2.3705458641052246 }, { "auxiliary_loss_clip": 0.01110369, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.02100289, "balance_loss_mlp": 1.03120804, "epoch": 0.15187133623929056, "flos": 19424451646080.0, "grad_norm": 1.7240981580721493, "language_loss": 0.79515433, "learning_rate": 3.7769253341193677e-06, "loss": 0.81669778, "num_input_tokens_seen": 54917060, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.79296875, "step": 2526, "time_per_iteration": 2.3717384338378906 }, { "auxiliary_loss_clip": 0.01101619, "auxiliary_loss_mlp": 0.01044334, "balance_loss_clip": 1.0238061, "balance_loss_mlp": 1.0290668, "epoch": 0.15193145949195852, "flos": 17565256725120.0, "grad_norm": 1.7227662650603675, "language_loss": 0.84842026, "learning_rate": 3.7767519012732968e-06, "loss": 0.86987978, "num_input_tokens_seen": 54936365, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.7265625, "step": 2527, "time_per_iteration": 2.3688910007476807 }, { "auxiliary_loss_clip": 0.01107457, "auxiliary_loss_mlp": 0.01039621, "balance_loss_clip": 1.01920068, "balance_loss_mlp": 1.0299511, "epoch": 0.15199158274462649, "flos": 36174731437440.0, "grad_norm": 2.117843194012049, "language_loss": 0.69113839, "learning_rate": 3.77657840501952e-06, "loss": 0.71260917, "num_input_tokens_seen": 54961365, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.77734375, "step": 2528, "time_per_iteration": 2.517273187637329 }, { "auxiliary_loss_clip": 0.01107373, "auxiliary_loss_mlp": 0.01045553, "balance_loss_clip": 1.02377367, "balance_loss_mlp": 1.03085923, "epoch": 0.15205170599729445, "flos": 23072980124160.0, "grad_norm": 1.8420005400037076, "language_loss": 0.86785525, "learning_rate": 3.77640484536423e-06, "loss": 0.88938451, "num_input_tokens_seen": 54980750, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.765625, "step": 2529, "time_per_iteration": 2.3985602855682373 }, { "auxiliary_loss_clip": 0.01104006, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.0161376, "balance_loss_mlp": 1.02957439, "epoch": 0.15211182924996242, "flos": 21907299916800.0, "grad_norm": 1.9081949275850647, "language_loss": 0.83878064, "learning_rate": 3.7762312223136206e-06, "loss": 0.86019295, "num_input_tokens_seen": 54999675, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.74609375, "step": 2530, "time_per_iteration": 2.369981050491333 }, { "auxiliary_loss_clip": 0.01107426, "auxiliary_loss_mlp": 0.01041581, "balance_loss_clip": 1.01987326, "balance_loss_mlp": 1.03080463, "epoch": 0.15217195250263038, "flos": 13880662945920.0, "grad_norm": 2.195533786066148, "language_loss": 0.80004138, "learning_rate": 3.7760575358738885e-06, "loss": 0.82153153, "num_input_tokens_seen": 55018295, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.765625, "step": 2531, "time_per_iteration": 2.3685219287872314 }, { "auxiliary_loss_clip": 0.011084, "auxiliary_loss_mlp": 0.01044006, "balance_loss_clip": 1.02345395, "balance_loss_mlp": 1.03170586, "epoch": 0.15223207575529837, "flos": 24534165012480.0, "grad_norm": 1.812882691072394, "language_loss": 0.78945243, "learning_rate": 3.7758837860512306e-06, "loss": 0.81097651, "num_input_tokens_seen": 55037975, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.765625, "step": 2532, "time_per_iteration": 2.4467689990997314 }, { "auxiliary_loss_clip": 0.01106842, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.02085471, "balance_loss_mlp": 1.03198981, "epoch": 0.15229219900796634, "flos": 25555618926720.0, "grad_norm": 4.311532411440558, "language_loss": 0.87922168, "learning_rate": 3.775709972851849e-06, "loss": 0.90070897, "num_input_tokens_seen": 55057135, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.75, "step": 2533, "time_per_iteration": 2.43026065826416 }, { "auxiliary_loss_clip": 0.01106862, "auxiliary_loss_mlp": 0.01046504, "balance_loss_clip": 1.02447379, "balance_loss_mlp": 1.03032994, "epoch": 0.1523523222606343, "flos": 18216980674560.0, "grad_norm": 2.273913263189555, "language_loss": 0.78457522, "learning_rate": 3.775536096281946e-06, "loss": 0.80610883, "num_input_tokens_seen": 55075525, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.765625, "step": 2534, "time_per_iteration": 2.351072072982788 }, { "auxiliary_loss_clip": 0.0110932, "auxiliary_loss_mlp": 0.01040589, "balance_loss_clip": 1.01833248, "balance_loss_mlp": 1.02856922, "epoch": 0.15241244551330227, "flos": 13259278834560.0, "grad_norm": 3.0967219246986604, "language_loss": 0.76634681, "learning_rate": 3.7753621563477268e-06, "loss": 0.78784585, "num_input_tokens_seen": 55090845, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.80859375, "step": 2535, "time_per_iteration": 2.3607687950134277 }, { "auxiliary_loss_clip": 0.01112375, "auxiliary_loss_mlp": 0.01041494, "balance_loss_clip": 1.01929736, "balance_loss_mlp": 1.03086329, "epoch": 0.15247256876597023, "flos": 19714649800320.0, "grad_norm": 1.9995141359856081, "language_loss": 0.78141522, "learning_rate": 3.7751881530553993e-06, "loss": 0.80295384, "num_input_tokens_seen": 55108750, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.81640625, "step": 2536, "time_per_iteration": 2.3622381687164307 }, { "auxiliary_loss_clip": 0.01105197, "auxiliary_loss_mlp": 0.01042185, "balance_loss_clip": 1.02139473, "balance_loss_mlp": 1.03205538, "epoch": 0.1525326920186382, "flos": 20374822298880.0, "grad_norm": 3.0883739704224134, "language_loss": 0.76058221, "learning_rate": 3.775014086411173e-06, "loss": 0.78205609, "num_input_tokens_seen": 55126750, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.73046875, "step": 2537, "time_per_iteration": 2.370582342147827 }, { "auxiliary_loss_clip": 0.0110693, "auxiliary_loss_mlp": 0.01042626, "balance_loss_clip": 1.02166939, "balance_loss_mlp": 1.03100705, "epoch": 0.15259281527130616, "flos": 13589103248640.0, "grad_norm": 2.696911866493913, "language_loss": 0.77871943, "learning_rate": 3.7748399564212595e-06, "loss": 0.80021489, "num_input_tokens_seen": 55144690, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7578125, "step": 2538, "time_per_iteration": 2.358154058456421 }, { "auxiliary_loss_clip": 0.01102209, "auxiliary_loss_mlp": 0.0103443, "balance_loss_clip": 1.01554728, "balance_loss_mlp": 1.02940416, "epoch": 0.15265293852397416, "flos": 22859171758080.0, "grad_norm": 2.018316998131727, "language_loss": 0.89714652, "learning_rate": 3.7746657630918735e-06, "loss": 0.91851294, "num_input_tokens_seen": 55166055, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.7265625, "step": 2539, "time_per_iteration": 2.408341646194458 }, { "auxiliary_loss_clip": 0.01106273, "auxiliary_loss_mlp": 0.0104616, "balance_loss_clip": 1.02464235, "balance_loss_mlp": 1.02910089, "epoch": 0.15271306177664212, "flos": 29236931038080.0, "grad_norm": 2.0070992879303664, "language_loss": 0.93298948, "learning_rate": 3.7744915064292313e-06, "loss": 0.95451379, "num_input_tokens_seen": 55186285, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7734375, "step": 2540, "time_per_iteration": 2.4411098957061768 }, { "auxiliary_loss_clip": 0.01100278, "auxiliary_loss_mlp": 0.01037735, "balance_loss_clip": 1.01820827, "balance_loss_mlp": 1.02696204, "epoch": 0.1527731850293101, "flos": 31244995463040.0, "grad_norm": 1.6043658348108991, "language_loss": 0.75354832, "learning_rate": 3.7743171864395524e-06, "loss": 0.77492845, "num_input_tokens_seen": 55207915, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.734375, "step": 2541, "time_per_iteration": 2.463897466659546 }, { "auxiliary_loss_clip": 0.01101776, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.02097011, "balance_loss_mlp": 1.02875042, "epoch": 0.15283330828197805, "flos": 22381001510400.0, "grad_norm": 1.6350841919306038, "language_loss": 0.81249166, "learning_rate": 3.774142803129057e-06, "loss": 0.83392459, "num_input_tokens_seen": 55227860, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.73046875, "step": 2542, "time_per_iteration": 2.385455369949341 }, { "auxiliary_loss_clip": 0.01106315, "auxiliary_loss_mlp": 0.01043719, "balance_loss_clip": 1.02259505, "balance_loss_mlp": 1.02995694, "epoch": 0.15289343153464602, "flos": 25518960132480.0, "grad_norm": 1.7618678381994837, "language_loss": 0.77379119, "learning_rate": 3.7739683565039674e-06, "loss": 0.79529154, "num_input_tokens_seen": 55247330, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.765625, "step": 2543, "time_per_iteration": 3.8703291416168213 }, { "auxiliary_loss_clip": 0.01103589, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.01696634, "balance_loss_mlp": 1.02916515, "epoch": 0.15295355478731398, "flos": 22708940711040.0, "grad_norm": 1.898209789518137, "language_loss": 0.86182797, "learning_rate": 3.7737938465705115e-06, "loss": 0.88324201, "num_input_tokens_seen": 55266195, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.74609375, "step": 2544, "time_per_iteration": 2.3836936950683594 }, { "auxiliary_loss_clip": 0.01105948, "auxiliary_loss_mlp": 0.01043145, "balance_loss_clip": 1.02056634, "balance_loss_mlp": 1.02821922, "epoch": 0.15301367803998198, "flos": 23250967568640.0, "grad_norm": 2.012905755527917, "language_loss": 0.8258521, "learning_rate": 3.773619273334916e-06, "loss": 0.84734297, "num_input_tokens_seen": 55283305, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.77734375, "step": 2545, "time_per_iteration": 3.8111298084259033 }, { "auxiliary_loss_clip": 0.01103459, "auxiliary_loss_mlp": 0.01039394, "balance_loss_clip": 1.01785302, "balance_loss_mlp": 1.02929902, "epoch": 0.15307380129264994, "flos": 25885059315840.0, "grad_norm": 2.5491668962390683, "language_loss": 0.71097058, "learning_rate": 3.77344463680341e-06, "loss": 0.73239911, "num_input_tokens_seen": 55303035, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7421875, "step": 2546, "time_per_iteration": 2.455937147140503 }, { "auxiliary_loss_clip": 0.01103855, "auxiliary_loss_mlp": 0.01043118, "balance_loss_clip": 1.02050447, "balance_loss_mlp": 1.02750611, "epoch": 0.1531339245453179, "flos": 46971482279040.0, "grad_norm": 1.8089644794002437, "language_loss": 0.77684152, "learning_rate": 3.7732699369822276e-06, "loss": 0.79831123, "num_input_tokens_seen": 55327570, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.76171875, "step": 2547, "time_per_iteration": 3.963167190551758 }, { "auxiliary_loss_clip": 0.01105039, "auxiliary_loss_mlp": 0.0104707, "balance_loss_clip": 1.02424145, "balance_loss_mlp": 1.02820408, "epoch": 0.15319404779798587, "flos": 35880588299520.0, "grad_norm": 2.501128504882286, "language_loss": 0.74221045, "learning_rate": 3.7730951738776025e-06, "loss": 0.76373148, "num_input_tokens_seen": 55351090, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.76953125, "step": 2548, "time_per_iteration": 2.5057740211486816 }, { "auxiliary_loss_clip": 0.01107012, "auxiliary_loss_mlp": 0.01040167, "balance_loss_clip": 1.01805377, "balance_loss_mlp": 1.02945113, "epoch": 0.15325417105065384, "flos": 25663500627840.0, "grad_norm": 1.3539366184629202, "language_loss": 0.80474466, "learning_rate": 3.7729203474957715e-06, "loss": 0.82621646, "num_input_tokens_seen": 55371050, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.7734375, "step": 2549, "time_per_iteration": 2.4154052734375 }, { "auxiliary_loss_clip": 0.01105433, "auxiliary_loss_mlp": 0.01038403, "balance_loss_clip": 1.0184834, "balance_loss_mlp": 1.02960038, "epoch": 0.1533142943033218, "flos": 18769830053760.0, "grad_norm": 1.7346121941856547, "language_loss": 0.74965739, "learning_rate": 3.7727454578429735e-06, "loss": 0.77109581, "num_input_tokens_seen": 55390375, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.7578125, "step": 2550, "time_per_iteration": 2.369154691696167 }, { "auxiliary_loss_clip": 0.01107496, "auxiliary_loss_mlp": 0.01047447, "balance_loss_clip": 1.0250597, "balance_loss_mlp": 1.02971649, "epoch": 0.15337441755598977, "flos": 23106392161920.0, "grad_norm": 2.2196697060420028, "language_loss": 0.77113855, "learning_rate": 3.7725705049254507e-06, "loss": 0.79268789, "num_input_tokens_seen": 55408890, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.77734375, "step": 2551, "time_per_iteration": 2.379380941390991 }, { "auxiliary_loss_clip": 0.01020893, "auxiliary_loss_mlp": 0.01003217, "balance_loss_clip": 1.00111914, "balance_loss_mlp": 1.00219727, "epoch": 0.15343454080865776, "flos": 59857712703360.0, "grad_norm": 0.9447315928812187, "language_loss": 0.56754923, "learning_rate": 3.7723954887494457e-06, "loss": 0.58779031, "num_input_tokens_seen": 55463815, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.1875, "step": 2552, "time_per_iteration": 2.9446704387664795 }, { "auxiliary_loss_clip": 0.01107918, "auxiliary_loss_mlp": 0.01040425, "balance_loss_clip": 1.01816869, "balance_loss_mlp": 1.02918124, "epoch": 0.15349466406132573, "flos": 11910095187840.0, "grad_norm": 2.3160070802126898, "language_loss": 0.88518476, "learning_rate": 3.772220409321205e-06, "loss": 0.90666825, "num_input_tokens_seen": 55481050, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.7890625, "step": 2553, "time_per_iteration": 2.35170841217041 }, { "auxiliary_loss_clip": 0.01108115, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.01832306, "balance_loss_mlp": 1.02948189, "epoch": 0.1535547873139937, "flos": 24095795581440.0, "grad_norm": 3.7418927186046984, "language_loss": 0.78330117, "learning_rate": 3.7720452666469766e-06, "loss": 0.80478442, "num_input_tokens_seen": 55500050, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.78515625, "step": 2554, "time_per_iteration": 2.4206840991973877 }, { "auxiliary_loss_clip": 0.01111027, "auxiliary_loss_mlp": 0.01044822, "balance_loss_clip": 1.02211225, "balance_loss_mlp": 1.03178763, "epoch": 0.15361491056666166, "flos": 17565501104640.0, "grad_norm": 2.5973429021719974, "language_loss": 0.77826989, "learning_rate": 3.7718700607330114e-06, "loss": 0.79982841, "num_input_tokens_seen": 55518125, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.79296875, "step": 2555, "time_per_iteration": 2.356719970703125 }, { "auxiliary_loss_clip": 0.01102546, "auxiliary_loss_mlp": 0.01039915, "balance_loss_clip": 1.02055502, "balance_loss_mlp": 1.02782702, "epoch": 0.15367503381932962, "flos": 25044874513920.0, "grad_norm": 1.6601553386463048, "language_loss": 0.77098221, "learning_rate": 3.7716947915855607e-06, "loss": 0.7924068, "num_input_tokens_seen": 55540960, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.74609375, "step": 2556, "time_per_iteration": 2.433222532272339 }, { "auxiliary_loss_clip": 0.01102502, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.01278019, "balance_loss_mlp": 1.0288049, "epoch": 0.15373515707199759, "flos": 21506252595840.0, "grad_norm": 1.9197042729823908, "language_loss": 0.89976764, "learning_rate": 3.7715194592108805e-06, "loss": 0.92112815, "num_input_tokens_seen": 55559210, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.734375, "step": 2557, "time_per_iteration": 2.376099109649658 }, { "auxiliary_loss_clip": 0.01105636, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.01971674, "balance_loss_mlp": 1.02806902, "epoch": 0.15379528032466555, "flos": 25993534510080.0, "grad_norm": 1.9933591767860965, "language_loss": 0.71279323, "learning_rate": 3.7713440636152276e-06, "loss": 0.73427367, "num_input_tokens_seen": 55578925, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.7734375, "step": 2558, "time_per_iteration": 2.4223055839538574 }, { "auxiliary_loss_clip": 0.0110847, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.02246046, "balance_loss_mlp": 1.03012931, "epoch": 0.15385540357733354, "flos": 19276420014720.0, "grad_norm": 2.4142626335273087, "language_loss": 0.91885328, "learning_rate": 3.7711686048048613e-06, "loss": 0.94037807, "num_input_tokens_seen": 55597255, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.78125, "step": 2559, "time_per_iteration": 2.371500253677368 }, { "auxiliary_loss_clip": 0.01106853, "auxiliary_loss_mlp": 0.01046784, "balance_loss_clip": 1.02265644, "balance_loss_mlp": 1.02879667, "epoch": 0.1539155268300015, "flos": 28547850067200.0, "grad_norm": 2.5031426019798815, "language_loss": 0.63263065, "learning_rate": 3.7709930827860445e-06, "loss": 0.65416706, "num_input_tokens_seen": 55619515, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 0.78125, "step": 2560, "time_per_iteration": 2.436124324798584 }, { "auxiliary_loss_clip": 0.01105785, "auxiliary_loss_mlp": 0.01048935, "balance_loss_clip": 1.02601123, "balance_loss_mlp": 1.02788079, "epoch": 0.15397565008266947, "flos": 23546821363200.0, "grad_norm": 1.8593641817635, "language_loss": 0.88214654, "learning_rate": 3.770817497565039e-06, "loss": 0.90369374, "num_input_tokens_seen": 55640050, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.78125, "step": 2561, "time_per_iteration": 2.4040687084198 }, { "auxiliary_loss_clip": 0.01103428, "auxiliary_loss_mlp": 0.01036151, "balance_loss_clip": 1.01612425, "balance_loss_mlp": 1.02934492, "epoch": 0.15403577333533744, "flos": 17128842330240.0, "grad_norm": 1.8613872011541217, "language_loss": 0.8302772, "learning_rate": 3.770641849148113e-06, "loss": 0.85167301, "num_input_tokens_seen": 55658695, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.7421875, "step": 2562, "time_per_iteration": 2.362842559814453 }, { "auxiliary_loss_clip": 0.01111663, "auxiliary_loss_mlp": 0.01048927, "balance_loss_clip": 1.024418, "balance_loss_mlp": 1.03037024, "epoch": 0.1540958965880054, "flos": 17893545039360.0, "grad_norm": 2.6615220319396173, "language_loss": 0.74593759, "learning_rate": 3.7704661375415336e-06, "loss": 0.76754344, "num_input_tokens_seen": 55676340, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 0.8125, "step": 2563, "time_per_iteration": 2.3713762760162354 }, { "auxiliary_loss_clip": 0.01106939, "auxiliary_loss_mlp": 0.01037819, "balance_loss_clip": 1.01505017, "balance_loss_mlp": 1.02760148, "epoch": 0.15415601984067337, "flos": 32159684839680.0, "grad_norm": 2.127160502792017, "language_loss": 0.7599268, "learning_rate": 3.770290362751572e-06, "loss": 0.78137439, "num_input_tokens_seen": 55698890, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.79296875, "step": 2564, "time_per_iteration": 2.4816644191741943 }, { "auxiliary_loss_clip": 0.01103838, "auxiliary_loss_mlp": 0.01044494, "balance_loss_clip": 1.02382302, "balance_loss_mlp": 1.02820563, "epoch": 0.15421614309334136, "flos": 24023280954240.0, "grad_norm": 2.3929769772717817, "language_loss": 0.70904052, "learning_rate": 3.7701145247845006e-06, "loss": 0.73052388, "num_input_tokens_seen": 55718535, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.7578125, "step": 2565, "time_per_iteration": 2.4122776985168457 }, { "auxiliary_loss_clip": 0.01102412, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.02420139, "balance_loss_mlp": 1.02667618, "epoch": 0.15427626634600933, "flos": 24385225685760.0, "grad_norm": 128.62132603744894, "language_loss": 0.72072661, "learning_rate": 3.7699386236465954e-06, "loss": 0.74220693, "num_input_tokens_seen": 55738970, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.7578125, "step": 2566, "time_per_iteration": 2.411787271499634 }, { "auxiliary_loss_clip": 0.01101311, "auxiliary_loss_mlp": 0.01036585, "balance_loss_clip": 1.01612926, "balance_loss_mlp": 1.02717113, "epoch": 0.1543363895986773, "flos": 23330394645120.0, "grad_norm": 1.7865310614845986, "language_loss": 0.85019439, "learning_rate": 3.769762659344134e-06, "loss": 0.87157333, "num_input_tokens_seen": 55759585, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.7421875, "step": 2567, "time_per_iteration": 2.410080671310425 }, { "auxiliary_loss_clip": 0.01107123, "auxiliary_loss_mlp": 0.01041647, "balance_loss_clip": 1.02020097, "balance_loss_mlp": 1.0298934, "epoch": 0.15439651285134526, "flos": 24273294266880.0, "grad_norm": 1.8003980210100428, "language_loss": 0.78207928, "learning_rate": 3.7695866318833946e-06, "loss": 0.80356699, "num_input_tokens_seen": 55779250, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.7734375, "step": 2568, "time_per_iteration": 2.424752950668335 }, { "auxiliary_loss_clip": 0.01105023, "auxiliary_loss_mlp": 0.01037149, "balance_loss_clip": 1.0147258, "balance_loss_mlp": 1.02836204, "epoch": 0.15445663610401322, "flos": 22455052237440.0, "grad_norm": 1.8000672053146936, "language_loss": 0.70164311, "learning_rate": 3.769410541270661e-06, "loss": 0.72306484, "num_input_tokens_seen": 55800470, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.765625, "step": 2569, "time_per_iteration": 2.3858604431152344 }, { "auxiliary_loss_clip": 0.01100376, "auxiliary_loss_mlp": 0.0104229, "balance_loss_clip": 1.02138114, "balance_loss_mlp": 1.02779078, "epoch": 0.1545167593566812, "flos": 22048558744320.0, "grad_norm": 1.6523212296947989, "language_loss": 0.76557863, "learning_rate": 3.7692343875122167e-06, "loss": 0.78700531, "num_input_tokens_seen": 55817795, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7265625, "step": 2570, "time_per_iteration": 2.38033390045166 }, { "auxiliary_loss_clip": 0.01103751, "auxiliary_loss_mlp": 0.01040179, "balance_loss_clip": 1.0186975, "balance_loss_mlp": 1.02916908, "epoch": 0.15457688260934915, "flos": 19317233260800.0, "grad_norm": 2.519584689727318, "language_loss": 0.77578133, "learning_rate": 3.769058170614348e-06, "loss": 0.79722065, "num_input_tokens_seen": 55836125, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.74609375, "step": 2571, "time_per_iteration": 2.377727508544922 }, { "auxiliary_loss_clip": 0.01104421, "auxiliary_loss_mlp": 0.01042916, "balance_loss_clip": 1.02193546, "balance_loss_mlp": 1.02845848, "epoch": 0.15463700586201715, "flos": 24132838400640.0, "grad_norm": 2.585517759659818, "language_loss": 0.82445037, "learning_rate": 3.768881890583344e-06, "loss": 0.84592372, "num_input_tokens_seen": 55855280, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.7578125, "step": 2572, "time_per_iteration": 2.402364730834961 }, { "auxiliary_loss_clip": 0.01107173, "auxiliary_loss_mlp": 0.01036979, "balance_loss_clip": 1.01581907, "balance_loss_mlp": 1.02824247, "epoch": 0.1546971291146851, "flos": 22419789897600.0, "grad_norm": 1.5434582100933483, "language_loss": 0.90369272, "learning_rate": 3.7687055474254946e-06, "loss": 0.92513418, "num_input_tokens_seen": 55875695, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.7890625, "step": 2573, "time_per_iteration": 2.402705669403076 }, { "auxiliary_loss_clip": 0.01106665, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 1.02113163, "balance_loss_mlp": 1.02890539, "epoch": 0.15475725236735308, "flos": 17529261246720.0, "grad_norm": 1.746927606225564, "language_loss": 0.70083201, "learning_rate": 3.7685291411470946e-06, "loss": 0.72231728, "num_input_tokens_seen": 55894575, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.77734375, "step": 2574, "time_per_iteration": 2.3665170669555664 }, { "auxiliary_loss_clip": 0.01105633, "auxiliary_loss_mlp": 0.01043801, "balance_loss_clip": 1.02161622, "balance_loss_mlp": 1.02813995, "epoch": 0.15481737562002104, "flos": 22560734522880.0, "grad_norm": 1.749430047223734, "language_loss": 0.82673597, "learning_rate": 3.768352671754439e-06, "loss": 0.84823036, "num_input_tokens_seen": 55912855, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.7734375, "step": 2575, "time_per_iteration": 2.394659996032715 }, { "auxiliary_loss_clip": 0.01105269, "auxiliary_loss_mlp": 0.01038444, "balance_loss_clip": 1.0179522, "balance_loss_mlp": 1.02768183, "epoch": 0.154877498872689, "flos": 24899391411840.0, "grad_norm": 2.0641206323001935, "language_loss": 0.85018152, "learning_rate": 3.7681761392538246e-06, "loss": 0.87161869, "num_input_tokens_seen": 55932375, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.7734375, "step": 2576, "time_per_iteration": 2.4128005504608154 }, { "auxiliary_loss_clip": 0.01102041, "auxiliary_loss_mlp": 0.01040264, "balance_loss_clip": 1.019629, "balance_loss_mlp": 1.026214, "epoch": 0.15493762212535697, "flos": 28146244164480.0, "grad_norm": 1.7035648936845607, "language_loss": 0.82154602, "learning_rate": 3.7679995436515525e-06, "loss": 0.84296906, "num_input_tokens_seen": 55953970, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.7578125, "step": 2577, "time_per_iteration": 2.43742036819458 }, { "auxiliary_loss_clip": 0.0110781, "auxiliary_loss_mlp": 0.01047088, "balance_loss_clip": 1.02558279, "balance_loss_mlp": 1.03000402, "epoch": 0.15499774537802496, "flos": 25409891445120.0, "grad_norm": 2.681903882832599, "language_loss": 0.76673013, "learning_rate": 3.7678228849539244e-06, "loss": 0.78827906, "num_input_tokens_seen": 55973120, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.78125, "step": 2578, "time_per_iteration": 2.4083006381988525 }, { "auxiliary_loss_clip": 0.01106969, "auxiliary_loss_mlp": 0.01040705, "balance_loss_clip": 1.01773357, "balance_loss_mlp": 1.02976203, "epoch": 0.15505786863069293, "flos": 22090454242560.0, "grad_norm": 2.1068175774035662, "language_loss": 0.82854289, "learning_rate": 3.767646163167245e-06, "loss": 0.85001969, "num_input_tokens_seen": 55993260, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 0.76953125, "step": 2579, "time_per_iteration": 2.4026007652282715 }, { "auxiliary_loss_clip": 0.01104262, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.02056837, "balance_loss_mlp": 1.03090668, "epoch": 0.1551179918833609, "flos": 18916116117120.0, "grad_norm": 2.178197886575931, "language_loss": 0.80735964, "learning_rate": 3.7674693782978206e-06, "loss": 0.82880276, "num_input_tokens_seen": 56012130, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.734375, "step": 2580, "time_per_iteration": 2.367403268814087 }, { "auxiliary_loss_clip": 0.01024101, "auxiliary_loss_mlp": 0.01002831, "balance_loss_clip": 1.00063789, "balance_loss_mlp": 1.00525832, "epoch": 0.15517811513602886, "flos": 66235821096960.0, "grad_norm": 0.8381763744793062, "language_loss": 0.58836788, "learning_rate": 3.7672925303519605e-06, "loss": 0.60863721, "num_input_tokens_seen": 56079045, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.18847656, "step": 2581, "time_per_iteration": 3.164335250854492 }, { "auxiliary_loss_clip": 0.01108483, "auxiliary_loss_mlp": 0.01039474, "balance_loss_clip": 1.01849318, "balance_loss_mlp": 1.02883208, "epoch": 0.15523823838869683, "flos": 24020034197760.0, "grad_norm": 2.419429780114484, "language_loss": 0.8530618, "learning_rate": 3.7671156193359764e-06, "loss": 0.87454134, "num_input_tokens_seen": 56098745, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.796875, "step": 2582, "time_per_iteration": 3.7996675968170166 }, { "auxiliary_loss_clip": 0.01103487, "auxiliary_loss_mlp": 0.01044752, "balance_loss_clip": 1.02330661, "balance_loss_mlp": 1.02763653, "epoch": 0.1552983616413648, "flos": 20484030631680.0, "grad_norm": 2.3637131348154115, "language_loss": 0.78676498, "learning_rate": 3.766938645256182e-06, "loss": 0.80824739, "num_input_tokens_seen": 56117655, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7578125, "step": 2583, "time_per_iteration": 2.4146738052368164 }, { "auxiliary_loss_clip": 0.01103412, "auxiliary_loss_mlp": 0.01042965, "balance_loss_clip": 1.02308095, "balance_loss_mlp": 1.02806306, "epoch": 0.15535848489403276, "flos": 32122362729600.0, "grad_norm": 1.787448485889266, "language_loss": 0.76157773, "learning_rate": 3.766761608118892e-06, "loss": 0.78304148, "num_input_tokens_seen": 56141960, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.75390625, "step": 2584, "time_per_iteration": 2.497995376586914 }, { "auxiliary_loss_clip": 0.01102306, "auxiliary_loss_mlp": 0.01039194, "balance_loss_clip": 1.01690185, "balance_loss_mlp": 1.02709401, "epoch": 0.15541860814670075, "flos": 19097455052160.0, "grad_norm": 2.1192931896292055, "language_loss": 0.75837165, "learning_rate": 3.766584507930424e-06, "loss": 0.77978659, "num_input_tokens_seen": 56161430, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.75, "step": 2585, "time_per_iteration": 5.134189605712891 }, { "auxiliary_loss_clip": 0.01100754, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.01944494, "balance_loss_mlp": 1.02741408, "epoch": 0.1554787313993687, "flos": 19171086842880.0, "grad_norm": 2.61309060370953, "language_loss": 0.61490977, "learning_rate": 3.7664073446971e-06, "loss": 0.63631594, "num_input_tokens_seen": 56179390, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.734375, "step": 2586, "time_per_iteration": 3.731452465057373 }, { "auxiliary_loss_clip": 0.011037, "auxiliary_loss_mlp": 0.0103897, "balance_loss_clip": 1.01838279, "balance_loss_mlp": 1.02729452, "epoch": 0.15553885465203668, "flos": 16142895135360.0, "grad_norm": 1.5537971677562452, "language_loss": 0.80979955, "learning_rate": 3.7662301184252413e-06, "loss": 0.83122623, "num_input_tokens_seen": 56198020, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.76171875, "step": 2587, "time_per_iteration": 2.4011423587799072 }, { "auxiliary_loss_clip": 0.01106464, "auxiliary_loss_mlp": 0.01045561, "balance_loss_clip": 1.02371013, "balance_loss_mlp": 1.02809954, "epoch": 0.15559897790470464, "flos": 25336608768000.0, "grad_norm": 1.796175972885769, "language_loss": 0.88489425, "learning_rate": 3.766052829121173e-06, "loss": 0.90641451, "num_input_tokens_seen": 56218165, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.78125, "step": 2588, "time_per_iteration": 2.447763442993164 }, { "auxiliary_loss_clip": 0.01104995, "auxiliary_loss_mlp": 0.01047148, "balance_loss_clip": 1.02462912, "balance_loss_mlp": 1.0295099, "epoch": 0.1556591011573726, "flos": 23147659255680.0, "grad_norm": 2.333558389343231, "language_loss": 0.64972603, "learning_rate": 3.765875476791222e-06, "loss": 0.67124742, "num_input_tokens_seen": 56237160, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.75390625, "step": 2589, "time_per_iteration": 2.421673536300659 }, { "auxiliary_loss_clip": 0.01104252, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.01425052, "balance_loss_mlp": 1.02719223, "epoch": 0.15571922441004057, "flos": 25369811337600.0, "grad_norm": 1.7241537957140867, "language_loss": 0.82660699, "learning_rate": 3.765698061441718e-06, "loss": 0.84801698, "num_input_tokens_seen": 56257610, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.76953125, "step": 2590, "time_per_iteration": 2.4393231868743896 }, { "auxiliary_loss_clip": 0.01103367, "auxiliary_loss_mlp": 0.01036363, "balance_loss_clip": 1.01496434, "balance_loss_mlp": 1.02695584, "epoch": 0.15577934766270854, "flos": 14500510957440.0, "grad_norm": 1.987275243360299, "language_loss": 0.79317725, "learning_rate": 3.7655205830789918e-06, "loss": 0.81457454, "num_input_tokens_seen": 56275215, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.765625, "step": 2591, "time_per_iteration": 2.373990774154663 }, { "auxiliary_loss_clip": 0.01102398, "auxiliary_loss_mlp": 0.01047158, "balance_loss_clip": 1.02709532, "balance_loss_mlp": 1.02724028, "epoch": 0.15583947091537653, "flos": 37413031006080.0, "grad_norm": 3.0056621303756965, "language_loss": 0.64931399, "learning_rate": 3.7653430417093777e-06, "loss": 0.67080957, "num_input_tokens_seen": 56297130, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.75390625, "step": 2592, "time_per_iteration": 2.553753614425659 }, { "auxiliary_loss_clip": 0.01107732, "auxiliary_loss_mlp": 0.01042511, "balance_loss_clip": 1.01964641, "balance_loss_mlp": 1.02997577, "epoch": 0.1558995941680445, "flos": 21833668126080.0, "grad_norm": 1.990849898249795, "language_loss": 0.81727475, "learning_rate": 3.765165437339211e-06, "loss": 0.83877718, "num_input_tokens_seen": 56314995, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 0.77734375, "step": 2593, "time_per_iteration": 2.397907018661499 }, { "auxiliary_loss_clip": 0.01100068, "auxiliary_loss_mlp": 0.01039175, "balance_loss_clip": 1.01808691, "balance_loss_mlp": 1.02801895, "epoch": 0.15595971742071246, "flos": 19791598170240.0, "grad_norm": 2.0859177285149797, "language_loss": 0.73165357, "learning_rate": 3.764987769974831e-06, "loss": 0.75304604, "num_input_tokens_seen": 56334005, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.71875, "step": 2594, "time_per_iteration": 2.378899335861206 }, { "auxiliary_loss_clip": 0.01098355, "auxiliary_loss_mlp": 0.01035254, "balance_loss_clip": 1.01486969, "balance_loss_mlp": 1.02698159, "epoch": 0.15601984067338043, "flos": 26720984931840.0, "grad_norm": 3.6423690182773587, "language_loss": 0.81098974, "learning_rate": 3.764810039622577e-06, "loss": 0.83232594, "num_input_tokens_seen": 56353795, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.71484375, "step": 2595, "time_per_iteration": 2.4521374702453613 }, { "auxiliary_loss_clip": 0.01100865, "auxiliary_loss_mlp": 0.01038839, "balance_loss_clip": 1.01794171, "balance_loss_mlp": 1.02678692, "epoch": 0.1560799639260484, "flos": 18368293973760.0, "grad_norm": 1.9787245185617681, "language_loss": 0.86365926, "learning_rate": 3.7646322462887927e-06, "loss": 0.88505626, "num_input_tokens_seen": 56373195, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7421875, "step": 2596, "time_per_iteration": 2.3852343559265137 }, { "auxiliary_loss_clip": 0.01099539, "auxiliary_loss_mlp": 0.01037294, "balance_loss_clip": 1.01727879, "balance_loss_mlp": 1.02835238, "epoch": 0.15614008717871636, "flos": 22597951898880.0, "grad_norm": 1.657866688818024, "language_loss": 0.68346548, "learning_rate": 3.764454389979822e-06, "loss": 0.70483381, "num_input_tokens_seen": 56391525, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.7109375, "step": 2597, "time_per_iteration": 2.417179822921753 }, { "auxiliary_loss_clip": 0.01097967, "auxiliary_loss_mlp": 0.01040386, "balance_loss_clip": 1.02059722, "balance_loss_mlp": 1.02725589, "epoch": 0.15620021043138435, "flos": 22745774062080.0, "grad_norm": 1.783937869785717, "language_loss": 0.79627144, "learning_rate": 3.7642764707020134e-06, "loss": 0.81765497, "num_input_tokens_seen": 56410715, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.70703125, "step": 2598, "time_per_iteration": 2.415555238723755 }, { "auxiliary_loss_clip": 0.01096528, "auxiliary_loss_mlp": 0.01032579, "balance_loss_clip": 1.01294494, "balance_loss_mlp": 1.02566028, "epoch": 0.15626033368405232, "flos": 13114109934720.0, "grad_norm": 2.19423517852279, "language_loss": 0.82752991, "learning_rate": 3.764098488461716e-06, "loss": 0.84882104, "num_input_tokens_seen": 56429170, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.7109375, "step": 2599, "time_per_iteration": 2.381692409515381 }, { "auxiliary_loss_clip": 0.01107132, "auxiliary_loss_mlp": 0.01037977, "balance_loss_clip": 1.01541018, "balance_loss_mlp": 1.02894783, "epoch": 0.15632045693672028, "flos": 16471358006400.0, "grad_norm": 2.8044434070259467, "language_loss": 0.81662029, "learning_rate": 3.7639204432652808e-06, "loss": 0.83807135, "num_input_tokens_seen": 56445685, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.78125, "step": 2600, "time_per_iteration": 2.3765108585357666 }, { "auxiliary_loss_clip": 0.01106237, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.01822448, "balance_loss_mlp": 1.02960777, "epoch": 0.15638058018938825, "flos": 20849291942400.0, "grad_norm": 1.8183566185622821, "language_loss": 0.884462, "learning_rate": 3.7637423351190628e-06, "loss": 0.9059099, "num_input_tokens_seen": 56465900, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.765625, "step": 2601, "time_per_iteration": 2.4123878479003906 }, { "auxiliary_loss_clip": 0.01107621, "auxiliary_loss_mlp": 0.01056924, "balance_loss_clip": 1.03384519, "balance_loss_mlp": 1.03044391, "epoch": 0.1564407034420562, "flos": 21871129881600.0, "grad_norm": 1.6946203892584524, "language_loss": 0.78171384, "learning_rate": 3.7635641640294177e-06, "loss": 0.80335927, "num_input_tokens_seen": 56485020, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.7734375, "step": 2602, "time_per_iteration": 2.4028422832489014 }, { "auxiliary_loss_clip": 0.01101798, "auxiliary_loss_mlp": 0.01039578, "balance_loss_clip": 1.01946712, "balance_loss_mlp": 1.02789998, "epoch": 0.15650082669472418, "flos": 21833493569280.0, "grad_norm": 3.598417753453261, "language_loss": 0.73629385, "learning_rate": 3.7633859300027036e-06, "loss": 0.7577076, "num_input_tokens_seen": 56505205, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.73828125, "step": 2603, "time_per_iteration": 2.4165844917297363 }, { "auxiliary_loss_clip": 0.01103958, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.02513182, "balance_loss_mlp": 1.02844143, "epoch": 0.15656094994739214, "flos": 13799909237760.0, "grad_norm": 2.672069342832569, "language_loss": 0.87356353, "learning_rate": 3.7632076330452823e-06, "loss": 0.89506054, "num_input_tokens_seen": 56521495, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.75390625, "step": 2604, "time_per_iteration": 2.3788928985595703 }, { "auxiliary_loss_clip": 0.01102621, "auxiliary_loss_mlp": 0.01041581, "balance_loss_clip": 1.02092218, "balance_loss_mlp": 1.0275898, "epoch": 0.15662107320006013, "flos": 27306967057920.0, "grad_norm": 1.9506636409082554, "language_loss": 0.85097289, "learning_rate": 3.7630292731635155e-06, "loss": 0.87241483, "num_input_tokens_seen": 56540665, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.75, "step": 2605, "time_per_iteration": 2.4615745544433594 }, { "auxiliary_loss_clip": 0.01107971, "auxiliary_loss_mlp": 0.01042204, "balance_loss_clip": 1.02082968, "balance_loss_mlp": 1.02778459, "epoch": 0.1566811964527281, "flos": 26683942112640.0, "grad_norm": 2.1938220667631048, "language_loss": 0.73083031, "learning_rate": 3.762850850363769e-06, "loss": 0.75233209, "num_input_tokens_seen": 56560805, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.80078125, "step": 2606, "time_per_iteration": 2.446239471435547 }, { "auxiliary_loss_clip": 0.01104533, "auxiliary_loss_mlp": 0.0103651, "balance_loss_clip": 1.01601839, "balance_loss_mlp": 1.0298152, "epoch": 0.15674131970539606, "flos": 16102605559680.0, "grad_norm": 2.216983009685828, "language_loss": 0.76660913, "learning_rate": 3.7626723646524107e-06, "loss": 0.78801954, "num_input_tokens_seen": 56576335, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.74609375, "step": 2607, "time_per_iteration": 2.374535083770752 }, { "auxiliary_loss_clip": 0.01101664, "auxiliary_loss_mlp": 0.01040982, "balance_loss_clip": 1.02083576, "balance_loss_mlp": 1.02859378, "epoch": 0.15680144295806403, "flos": 19168747781760.0, "grad_norm": 2.121054199236041, "language_loss": 0.81724632, "learning_rate": 3.7624938160358096e-06, "loss": 0.83867276, "num_input_tokens_seen": 56595880, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.73046875, "step": 2608, "time_per_iteration": 2.399024486541748 }, { "auxiliary_loss_clip": 0.01107222, "auxiliary_loss_mlp": 0.01045988, "balance_loss_clip": 1.02292061, "balance_loss_mlp": 1.02944684, "epoch": 0.156861566210732, "flos": 20812388768640.0, "grad_norm": 2.286626616381914, "language_loss": 0.72848833, "learning_rate": 3.762315204520338e-06, "loss": 0.75002038, "num_input_tokens_seen": 56615130, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.77734375, "step": 2609, "time_per_iteration": 2.435127019882202 }, { "auxiliary_loss_clip": 0.01103131, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.01733494, "balance_loss_mlp": 1.02717757, "epoch": 0.15692168946339996, "flos": 20046883098240.0, "grad_norm": 2.2163270139322533, "language_loss": 0.71791583, "learning_rate": 3.7621365301123696e-06, "loss": 0.73932755, "num_input_tokens_seen": 56634005, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.7578125, "step": 2610, "time_per_iteration": 2.417283296585083 }, { "auxiliary_loss_clip": 0.0110366, "auxiliary_loss_mlp": 0.01042168, "balance_loss_clip": 1.02022147, "balance_loss_mlp": 1.02626252, "epoch": 0.15698181271606793, "flos": 21396939528960.0, "grad_norm": 1.6604171307479039, "language_loss": 0.72618192, "learning_rate": 3.7619577928182816e-06, "loss": 0.74764025, "num_input_tokens_seen": 56653480, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.7734375, "step": 2611, "time_per_iteration": 2.42850399017334 }, { "auxiliary_loss_clip": 0.01102773, "auxiliary_loss_mlp": 0.0104017, "balance_loss_clip": 1.02003598, "balance_loss_mlp": 1.02798891, "epoch": 0.15704193596873592, "flos": 20844858199680.0, "grad_norm": 2.1770139587214623, "language_loss": 0.70722824, "learning_rate": 3.7617789926444525e-06, "loss": 0.72865766, "num_input_tokens_seen": 56672270, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.74609375, "step": 2612, "time_per_iteration": 2.4021122455596924 }, { "auxiliary_loss_clip": 0.01105137, "auxiliary_loss_mlp": 0.01046943, "balance_loss_clip": 1.02634406, "balance_loss_mlp": 1.02847147, "epoch": 0.15710205922140388, "flos": 21761816814720.0, "grad_norm": 1.971352266797598, "language_loss": 0.75976723, "learning_rate": 3.761600129597262e-06, "loss": 0.78128803, "num_input_tokens_seen": 56691510, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.765625, "step": 2613, "time_per_iteration": 2.4084362983703613 }, { "auxiliary_loss_clip": 0.01103495, "auxiliary_loss_mlp": 0.01048327, "balance_loss_clip": 1.02672625, "balance_loss_mlp": 1.02705002, "epoch": 0.15716218247407185, "flos": 25006644708480.0, "grad_norm": 1.6309071429618132, "language_loss": 0.65967524, "learning_rate": 3.761421203683095e-06, "loss": 0.68119335, "num_input_tokens_seen": 56712230, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.765625, "step": 2614, "time_per_iteration": 2.4340224266052246 }, { "auxiliary_loss_clip": 0.01106212, "auxiliary_loss_mlp": 0.01040308, "balance_loss_clip": 1.01833797, "balance_loss_mlp": 1.02878881, "epoch": 0.1572223057267398, "flos": 20190795189120.0, "grad_norm": 2.356254018131287, "language_loss": 0.74882823, "learning_rate": 3.7612422149083362e-06, "loss": 0.77029347, "num_input_tokens_seen": 56727490, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.7734375, "step": 2615, "time_per_iteration": 2.355894088745117 }, { "auxiliary_loss_clip": 0.01100076, "auxiliary_loss_mlp": 0.0104314, "balance_loss_clip": 1.02350664, "balance_loss_mlp": 1.02799809, "epoch": 0.15728242897940778, "flos": 20958465363840.0, "grad_norm": 2.0172598137008455, "language_loss": 0.73029327, "learning_rate": 3.761063163279373e-06, "loss": 0.75172544, "num_input_tokens_seen": 56747385, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.72265625, "step": 2616, "time_per_iteration": 2.3996548652648926 }, { "auxiliary_loss_clip": 0.01103443, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.02121794, "balance_loss_mlp": 1.02759457, "epoch": 0.15734255223207574, "flos": 23037194113920.0, "grad_norm": 1.952295697980959, "language_loss": 0.72702718, "learning_rate": 3.7608840488025955e-06, "loss": 0.748487, "num_input_tokens_seen": 56768055, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.7578125, "step": 2617, "time_per_iteration": 2.3851678371429443 }, { "auxiliary_loss_clip": 0.01102768, "auxiliary_loss_mlp": 0.01037228, "balance_loss_clip": 1.01672435, "balance_loss_mlp": 1.02899408, "epoch": 0.15740267548474374, "flos": 20550435770880.0, "grad_norm": 2.7305945018535875, "language_loss": 0.74240804, "learning_rate": 3.760704871484396e-06, "loss": 0.76380801, "num_input_tokens_seen": 56785110, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.734375, "step": 2618, "time_per_iteration": 2.3909361362457275 }, { "auxiliary_loss_clip": 0.01106578, "auxiliary_loss_mlp": 0.01043279, "balance_loss_clip": 1.01898444, "balance_loss_mlp": 1.02754009, "epoch": 0.1574627987374117, "flos": 22666032783360.0, "grad_norm": 1.918600653750494, "language_loss": 0.78889054, "learning_rate": 3.7605256313311684e-06, "loss": 0.8103891, "num_input_tokens_seen": 56804975, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 0.7890625, "step": 2619, "time_per_iteration": 2.376849889755249 }, { "auxiliary_loss_clip": 0.01100153, "auxiliary_loss_mlp": 0.01035403, "balance_loss_clip": 1.01598394, "balance_loss_mlp": 1.02774501, "epoch": 0.15752292199007967, "flos": 16799716143360.0, "grad_norm": 1.9631199295647428, "language_loss": 0.76334906, "learning_rate": 3.7603463283493093e-06, "loss": 0.78470463, "num_input_tokens_seen": 56822470, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.72265625, "step": 2620, "time_per_iteration": 2.378913640975952 }, { "auxiliary_loss_clip": 0.01105, "auxiliary_loss_mlp": 0.01036747, "balance_loss_clip": 1.01451433, "balance_loss_mlp": 1.02806258, "epoch": 0.15758304524274763, "flos": 29824693643520.0, "grad_norm": 1.6950638831849694, "language_loss": 0.71077681, "learning_rate": 3.760166962545219e-06, "loss": 0.7321943, "num_input_tokens_seen": 56842100, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.76953125, "step": 2621, "time_per_iteration": 2.445683002471924 }, { "auxiliary_loss_clip": 0.01105344, "auxiliary_loss_mlp": 0.01040413, "balance_loss_clip": 1.01920557, "balance_loss_mlp": 1.02917194, "epoch": 0.1576431684954156, "flos": 53575478369280.0, "grad_norm": 2.043786453576383, "language_loss": 0.72216332, "learning_rate": 3.7599875339252962e-06, "loss": 0.74362087, "num_input_tokens_seen": 56865920, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.76171875, "step": 2622, "time_per_iteration": 4.0393757820129395 }, { "auxiliary_loss_clip": 0.01102739, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.01675987, "balance_loss_mlp": 1.0277462, "epoch": 0.15770329174808356, "flos": 20812563325440.0, "grad_norm": 1.728780941941876, "language_loss": 0.87419021, "learning_rate": 3.759808042495947e-06, "loss": 0.89558357, "num_input_tokens_seen": 56885265, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.75, "step": 2623, "time_per_iteration": 2.4093070030212402 }, { "auxiliary_loss_clip": 0.01103387, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.01710296, "balance_loss_mlp": 1.02887702, "epoch": 0.15776341500075153, "flos": 24972813734400.0, "grad_norm": 1.685512866388488, "language_loss": 0.81717169, "learning_rate": 3.7596284882635746e-06, "loss": 0.83857095, "num_input_tokens_seen": 56906710, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.74609375, "step": 2624, "time_per_iteration": 5.201894044876099 }, { "auxiliary_loss_clip": 0.01103976, "auxiliary_loss_mlp": 0.01039816, "balance_loss_clip": 1.01782215, "balance_loss_mlp": 1.02759087, "epoch": 0.15782353825341952, "flos": 21906846069120.0, "grad_norm": 2.6155308847246554, "language_loss": 0.7979489, "learning_rate": 3.7594488712345878e-06, "loss": 0.81938678, "num_input_tokens_seen": 56924275, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.765625, "step": 2625, "time_per_iteration": 2.397899866104126 }, { "auxiliary_loss_clip": 0.0110183, "auxiliary_loss_mlp": 0.01039829, "balance_loss_clip": 1.01994491, "balance_loss_mlp": 1.0284586, "epoch": 0.15788366150608749, "flos": 26175990608640.0, "grad_norm": 3.0363684040067476, "language_loss": 0.80167592, "learning_rate": 3.7592691914153967e-06, "loss": 0.82309252, "num_input_tokens_seen": 56941525, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.734375, "step": 2626, "time_per_iteration": 3.8083722591400146 }, { "auxiliary_loss_clip": 0.01104302, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.01789784, "balance_loss_mlp": 1.03113103, "epoch": 0.15794378475875545, "flos": 27708572960640.0, "grad_norm": 1.8168098782618698, "language_loss": 0.73536825, "learning_rate": 3.7590894488124134e-06, "loss": 0.75679517, "num_input_tokens_seen": 56962145, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.73046875, "step": 2627, "time_per_iteration": 2.457184076309204 }, { "auxiliary_loss_clip": 0.01102751, "auxiliary_loss_mlp": 0.01041898, "balance_loss_clip": 1.02078581, "balance_loss_mlp": 1.02858937, "epoch": 0.15800390801142342, "flos": 12129349726080.0, "grad_norm": 2.1391588192881947, "language_loss": 0.85239929, "learning_rate": 3.7589096434320534e-06, "loss": 0.87384582, "num_input_tokens_seen": 56977505, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.7421875, "step": 2628, "time_per_iteration": 2.3664088249206543 }, { "auxiliary_loss_clip": 0.01099321, "auxiliary_loss_mlp": 0.01036293, "balance_loss_clip": 1.01702857, "balance_loss_mlp": 1.0267477, "epoch": 0.15806403126409138, "flos": 20703669194880.0, "grad_norm": 1.8327200788202531, "language_loss": 0.76718879, "learning_rate": 3.7587297752807315e-06, "loss": 0.78854489, "num_input_tokens_seen": 56996770, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.7265625, "step": 2629, "time_per_iteration": 2.407407283782959 }, { "auxiliary_loss_clip": 0.01103757, "auxiliary_loss_mlp": 0.01045324, "balance_loss_clip": 1.02343762, "balance_loss_mlp": 1.02720749, "epoch": 0.15812415451675935, "flos": 17820751121280.0, "grad_norm": 2.4884600869974265, "language_loss": 0.73892325, "learning_rate": 3.758549844364869e-06, "loss": 0.76041412, "num_input_tokens_seen": 57014970, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.765625, "step": 2630, "time_per_iteration": 2.4013473987579346 }, { "auxiliary_loss_clip": 0.0110509, "auxiliary_loss_mlp": 0.01041841, "balance_loss_clip": 1.0194056, "balance_loss_mlp": 1.02786446, "epoch": 0.15818427776942734, "flos": 20083018222080.0, "grad_norm": 5.646154236075837, "language_loss": 0.83460271, "learning_rate": 3.7583698506908854e-06, "loss": 0.85607207, "num_input_tokens_seen": 57034045, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.76953125, "step": 2631, "time_per_iteration": 2.435514450073242 }, { "auxiliary_loss_clip": 0.01101833, "auxiliary_loss_mlp": 0.01036206, "balance_loss_clip": 1.01524854, "balance_loss_mlp": 1.02746201, "epoch": 0.1582444010220953, "flos": 21213855025920.0, "grad_norm": 1.702038878764565, "language_loss": 0.78231049, "learning_rate": 3.7581897942652046e-06, "loss": 0.80369091, "num_input_tokens_seen": 57053695, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7421875, "step": 2632, "time_per_iteration": 2.405518054962158 }, { "auxiliary_loss_clip": 0.01104133, "auxiliary_loss_mlp": 0.01050743, "balance_loss_clip": 1.0299412, "balance_loss_mlp": 1.0280633, "epoch": 0.15830452427476327, "flos": 17857375004160.0, "grad_norm": 2.156080894809283, "language_loss": 0.83225524, "learning_rate": 3.7580096750942535e-06, "loss": 0.85380399, "num_input_tokens_seen": 57071290, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.76171875, "step": 2633, "time_per_iteration": 2.364070177078247 }, { "auxiliary_loss_clip": 0.01104832, "auxiliary_loss_mlp": 0.01040668, "balance_loss_clip": 1.02018774, "balance_loss_mlp": 1.02880466, "epoch": 0.15836464752743123, "flos": 24533815898880.0, "grad_norm": 1.6509486986117194, "language_loss": 0.77444232, "learning_rate": 3.7578294931844584e-06, "loss": 0.79589731, "num_input_tokens_seen": 57091465, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.7578125, "step": 2634, "time_per_iteration": 2.4315414428710938 }, { "auxiliary_loss_clip": 0.01104861, "auxiliary_loss_mlp": 0.01038563, "balance_loss_clip": 1.01715338, "balance_loss_mlp": 1.02793598, "epoch": 0.1584247707800992, "flos": 20119781750400.0, "grad_norm": 3.32648958753033, "language_loss": 0.88971549, "learning_rate": 3.757649248542251e-06, "loss": 0.91114974, "num_input_tokens_seen": 57110075, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.76953125, "step": 2635, "time_per_iteration": 2.402858018875122 }, { "auxiliary_loss_clip": 0.01104143, "auxiliary_loss_mlp": 0.01043664, "balance_loss_clip": 1.02177715, "balance_loss_mlp": 1.02616, "epoch": 0.15848489403276717, "flos": 20374927032960.0, "grad_norm": 2.140323991383923, "language_loss": 0.75747037, "learning_rate": 3.757468941174063e-06, "loss": 0.77894843, "num_input_tokens_seen": 57128945, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.78125, "step": 2636, "time_per_iteration": 2.39902400970459 }, { "auxiliary_loss_clip": 0.01107464, "auxiliary_loss_mlp": 0.01042391, "balance_loss_clip": 1.02092123, "balance_loss_mlp": 1.02948594, "epoch": 0.15854501728543513, "flos": 39345368958720.0, "grad_norm": 2.2281739836646084, "language_loss": 0.71450502, "learning_rate": 3.7572885710863293e-06, "loss": 0.73600358, "num_input_tokens_seen": 57152385, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.78125, "step": 2637, "time_per_iteration": 2.563585042953491 }, { "auxiliary_loss_clip": 0.01102063, "auxiliary_loss_mlp": 0.01035121, "balance_loss_clip": 1.01518965, "balance_loss_mlp": 1.02702391, "epoch": 0.15860514053810312, "flos": 24863046819840.0, "grad_norm": 1.9987676543931971, "language_loss": 0.77517295, "learning_rate": 3.7571081382854866e-06, "loss": 0.79654485, "num_input_tokens_seen": 57172620, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.75, "step": 2638, "time_per_iteration": 2.4379286766052246 }, { "auxiliary_loss_clip": 0.01105014, "auxiliary_loss_mlp": 0.01041608, "balance_loss_clip": 1.0185771, "balance_loss_mlp": 1.02786207, "epoch": 0.1586652637907711, "flos": 26176479367680.0, "grad_norm": 1.7938691790713672, "language_loss": 0.75311208, "learning_rate": 3.756927642777974e-06, "loss": 0.77457821, "num_input_tokens_seen": 57194680, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.7734375, "step": 2639, "time_per_iteration": 2.448011636734009 }, { "auxiliary_loss_clip": 0.01106245, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.02793968, "balance_loss_mlp": 1.0296948, "epoch": 0.15872538704343905, "flos": 19791039588480.0, "grad_norm": 1.8529297947319283, "language_loss": 0.81090569, "learning_rate": 3.7567470845702337e-06, "loss": 0.83246183, "num_input_tokens_seen": 57214675, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.765625, "step": 2640, "time_per_iteration": 2.4345805644989014 }, { "auxiliary_loss_clip": 0.0110238, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.02324867, "balance_loss_mlp": 1.02789259, "epoch": 0.15878551029610702, "flos": 28474113542400.0, "grad_norm": 2.238978024847191, "language_loss": 0.66688108, "learning_rate": 3.756566463668709e-06, "loss": 0.68834043, "num_input_tokens_seen": 57235830, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.74609375, "step": 2641, "time_per_iteration": 2.429933547973633 }, { "auxiliary_loss_clip": 0.01110032, "auxiliary_loss_mlp": 0.01047067, "balance_loss_clip": 1.02529955, "balance_loss_mlp": 1.02991748, "epoch": 0.15884563354877498, "flos": 24205562496000.0, "grad_norm": 2.0593057564250232, "language_loss": 0.75106114, "learning_rate": 3.756385780079845e-06, "loss": 0.77263212, "num_input_tokens_seen": 57255970, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.80078125, "step": 2642, "time_per_iteration": 2.417956590652466 }, { "auxiliary_loss_clip": 0.01099275, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.02241588, "balance_loss_mlp": 1.02721715, "epoch": 0.15890575680144295, "flos": 23948706556800.0, "grad_norm": 1.763166913728333, "language_loss": 0.70588106, "learning_rate": 3.756205033810091e-06, "loss": 0.72731048, "num_input_tokens_seen": 57274435, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.72265625, "step": 2643, "time_per_iteration": 2.4096288681030273 }, { "auxiliary_loss_clip": 0.01099905, "auxiliary_loss_mlp": 0.01037241, "balance_loss_clip": 1.01777434, "balance_loss_mlp": 1.02727807, "epoch": 0.15896588005411091, "flos": 21213959760000.0, "grad_norm": 2.136780877812778, "language_loss": 0.77865797, "learning_rate": 3.7560242248658963e-06, "loss": 0.8000294, "num_input_tokens_seen": 57293115, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.7265625, "step": 2644, "time_per_iteration": 2.4319334030151367 }, { "auxiliary_loss_clip": 0.01100841, "auxiliary_loss_mlp": 0.0104081, "balance_loss_clip": 1.02156985, "balance_loss_mlp": 1.02712774, "epoch": 0.1590260033067789, "flos": 24351255066240.0, "grad_norm": 1.8506680923764118, "language_loss": 0.8223685, "learning_rate": 3.7558433532537145e-06, "loss": 0.84378505, "num_input_tokens_seen": 57312565, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.734375, "step": 2645, "time_per_iteration": 2.4091849327087402 }, { "auxiliary_loss_clip": 0.01103085, "auxiliary_loss_mlp": 0.01038907, "balance_loss_clip": 1.01693726, "balance_loss_mlp": 1.02752018, "epoch": 0.15908612655944687, "flos": 32047648686720.0, "grad_norm": 2.162556548938065, "language_loss": 0.70025808, "learning_rate": 3.75566241898e-06, "loss": 0.72167802, "num_input_tokens_seen": 57333360, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.7578125, "step": 2646, "time_per_iteration": 2.494476079940796 }, { "auxiliary_loss_clip": 0.01098996, "auxiliary_loss_mlp": 0.01037495, "balance_loss_clip": 1.01734889, "balance_loss_mlp": 1.0268023, "epoch": 0.15914624981211484, "flos": 17784406529280.0, "grad_norm": 2.392693770113908, "language_loss": 0.62278962, "learning_rate": 3.7554814220512095e-06, "loss": 0.64415455, "num_input_tokens_seen": 57350575, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.72265625, "step": 2647, "time_per_iteration": 2.339232921600342 }, { "auxiliary_loss_clip": 0.01101412, "auxiliary_loss_mlp": 0.01039351, "balance_loss_clip": 1.01848936, "balance_loss_mlp": 1.02895546, "epoch": 0.1592063730647828, "flos": 17711542788480.0, "grad_norm": 2.094533690090337, "language_loss": 0.89786607, "learning_rate": 3.755300362473803e-06, "loss": 0.91927373, "num_input_tokens_seen": 57367570, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7265625, "step": 2648, "time_per_iteration": 2.3754818439483643 }, { "auxiliary_loss_clip": 0.01099667, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.01740479, "balance_loss_mlp": 1.02806485, "epoch": 0.15926649631745077, "flos": 18802648598400.0, "grad_norm": 1.784209771623308, "language_loss": 0.91517699, "learning_rate": 3.7551192402542418e-06, "loss": 0.93653977, "num_input_tokens_seen": 57383980, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.71484375, "step": 2649, "time_per_iteration": 2.3740234375 }, { "auxiliary_loss_clip": 0.01108733, "auxiliary_loss_mlp": 0.01039118, "balance_loss_clip": 1.01682544, "balance_loss_mlp": 1.02754092, "epoch": 0.15932661957011873, "flos": 17565291636480.0, "grad_norm": 2.4707960811613074, "language_loss": 0.71221823, "learning_rate": 3.7549380553989893e-06, "loss": 0.73369676, "num_input_tokens_seen": 57400840, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.8125, "step": 2650, "time_per_iteration": 2.391444206237793 }, { "auxiliary_loss_clip": 0.01097245, "auxiliary_loss_mlp": 0.01033142, "balance_loss_clip": 1.01448572, "balance_loss_mlp": 1.02721882, "epoch": 0.15938674282278673, "flos": 13333504118400.0, "grad_norm": 1.84575017835478, "language_loss": 0.71013993, "learning_rate": 3.7547568079145116e-06, "loss": 0.73144376, "num_input_tokens_seen": 57419230, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.69921875, "step": 2651, "time_per_iteration": 2.355515956878662 }, { "auxiliary_loss_clip": 0.01102709, "auxiliary_loss_mlp": 0.01037868, "balance_loss_clip": 1.01563537, "balance_loss_mlp": 1.02707005, "epoch": 0.1594468660754547, "flos": 22487835870720.0, "grad_norm": 1.9484677562824262, "language_loss": 0.79622519, "learning_rate": 3.754575497807278e-06, "loss": 0.81763101, "num_input_tokens_seen": 57439315, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.7578125, "step": 2652, "time_per_iteration": 2.4259936809539795 }, { "auxiliary_loss_clip": 0.01100165, "auxiliary_loss_mlp": 0.01039507, "balance_loss_clip": 1.01925349, "balance_loss_mlp": 1.02832174, "epoch": 0.15950698932812266, "flos": 15006577248000.0, "grad_norm": 2.95076321606993, "language_loss": 0.69801968, "learning_rate": 3.7543941250837578e-06, "loss": 0.71941638, "num_input_tokens_seen": 57454635, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.71875, "step": 2653, "time_per_iteration": 2.325303554534912 }, { "auxiliary_loss_clip": 0.011016, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.01498032, "balance_loss_mlp": 1.02698135, "epoch": 0.15956711258079062, "flos": 30153715096320.0, "grad_norm": 2.0871788872937076, "language_loss": 0.77066928, "learning_rate": 3.7542126897504235e-06, "loss": 0.79204607, "num_input_tokens_seen": 57476805, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.74609375, "step": 2654, "time_per_iteration": 2.4599967002868652 }, { "auxiliary_loss_clip": 0.01098148, "auxiliary_loss_mlp": 0.01037046, "balance_loss_clip": 1.01642323, "balance_loss_mlp": 1.02560902, "epoch": 0.1596272358334586, "flos": 21031643306880.0, "grad_norm": 1.8948480458854995, "language_loss": 0.81581485, "learning_rate": 3.754031191813752e-06, "loss": 0.83716679, "num_input_tokens_seen": 57496400, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.7265625, "step": 2655, "time_per_iteration": 2.3683502674102783 }, { "auxiliary_loss_clip": 0.01099878, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.01475358, "balance_loss_mlp": 1.02669549, "epoch": 0.15968735908612655, "flos": 15267133791360.0, "grad_norm": 1.9719336390073554, "language_loss": 0.73297918, "learning_rate": 3.753849631280218e-06, "loss": 0.75431132, "num_input_tokens_seen": 57513700, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.73046875, "step": 2656, "time_per_iteration": 2.367332696914673 }, { "auxiliary_loss_clip": 0.01095125, "auxiliary_loss_mlp": 0.01037746, "balance_loss_clip": 1.01929212, "balance_loss_mlp": 1.02536428, "epoch": 0.15974748233879452, "flos": 52663791369600.0, "grad_norm": 2.1035566022409644, "language_loss": 0.77869081, "learning_rate": 3.7536680081563023e-06, "loss": 0.80001956, "num_input_tokens_seen": 57536180, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.69921875, "step": 2657, "time_per_iteration": 2.6401469707489014 }, { "auxiliary_loss_clip": 0.01099954, "auxiliary_loss_mlp": 0.01039954, "balance_loss_clip": 1.0214889, "balance_loss_mlp": 1.02863574, "epoch": 0.1598076055914625, "flos": 18732263564160.0, "grad_norm": 1.7498684972558385, "language_loss": 0.74488926, "learning_rate": 3.753486322448487e-06, "loss": 0.76628828, "num_input_tokens_seen": 57555025, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.71484375, "step": 2658, "time_per_iteration": 2.3692216873168945 }, { "auxiliary_loss_clip": 0.01099768, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.01396298, "balance_loss_mlp": 1.02635539, "epoch": 0.15986772884413047, "flos": 34347831390720.0, "grad_norm": 1.7291313209764942, "language_loss": 0.75411272, "learning_rate": 3.753304574163255e-06, "loss": 0.77546334, "num_input_tokens_seen": 57577660, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.734375, "step": 2659, "time_per_iteration": 2.4754250049591064 }, { "auxiliary_loss_clip": 0.01099666, "auxiliary_loss_mlp": 0.01040488, "balance_loss_clip": 1.01914978, "balance_loss_mlp": 1.0264492, "epoch": 0.15992785209679844, "flos": 22053865271040.0, "grad_norm": 1.9408709358154512, "language_loss": 0.90600204, "learning_rate": 3.7531227633070924e-06, "loss": 0.92740357, "num_input_tokens_seen": 57596335, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.734375, "step": 2660, "time_per_iteration": 2.3919284343719482 }, { "auxiliary_loss_clip": 0.01102115, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.01618242, "balance_loss_mlp": 1.02813041, "epoch": 0.1599879753494664, "flos": 33065436908160.0, "grad_norm": 1.6439194375650927, "language_loss": 0.77577305, "learning_rate": 3.7529408898864887e-06, "loss": 0.79715973, "num_input_tokens_seen": 57616830, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.7421875, "step": 2661, "time_per_iteration": 3.8710927963256836 }, { "auxiliary_loss_clip": 0.01099562, "auxiliary_loss_mlp": 0.01032414, "balance_loss_clip": 1.01285183, "balance_loss_mlp": 1.02597821, "epoch": 0.16004809860213437, "flos": 28036756540800.0, "grad_norm": 2.155112005459171, "language_loss": 0.74525195, "learning_rate": 3.752758953907933e-06, "loss": 0.7665717, "num_input_tokens_seen": 57635515, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.734375, "step": 2662, "time_per_iteration": 2.450744390487671 }, { "auxiliary_loss_clip": 0.0110121, "auxiliary_loss_mlp": 0.01042846, "balance_loss_clip": 1.02243745, "balance_loss_mlp": 1.02744985, "epoch": 0.16010822185480234, "flos": 22779116277120.0, "grad_norm": 1.948664428006108, "language_loss": 0.82199454, "learning_rate": 3.7525769553779192e-06, "loss": 0.84343511, "num_input_tokens_seen": 57654250, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.73828125, "step": 2663, "time_per_iteration": 3.7367546558380127 }, { "auxiliary_loss_clip": 0.01104684, "auxiliary_loss_mlp": 0.0103922, "balance_loss_clip": 1.01894271, "balance_loss_mlp": 1.02900457, "epoch": 0.16016834510747033, "flos": 20082983310720.0, "grad_norm": 1.9397599102893541, "language_loss": 0.80063188, "learning_rate": 3.7523948943029424e-06, "loss": 0.82207096, "num_input_tokens_seen": 57672645, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7578125, "step": 2664, "time_per_iteration": 3.814304828643799 }, { "auxiliary_loss_clip": 0.01099688, "auxiliary_loss_mlp": 0.01039963, "balance_loss_clip": 1.01994777, "balance_loss_mlp": 1.02588677, "epoch": 0.1602284683601383, "flos": 21172902134400.0, "grad_norm": 1.604862513318294, "language_loss": 0.93802118, "learning_rate": 3.752212770689499e-06, "loss": 0.9594177, "num_input_tokens_seen": 57691055, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.73828125, "step": 2665, "time_per_iteration": 2.383065938949585 }, { "auxiliary_loss_clip": 0.01101918, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.01403761, "balance_loss_mlp": 1.02683282, "epoch": 0.16028859161280626, "flos": 14646692286720.0, "grad_norm": 2.328704262901842, "language_loss": 0.84797919, "learning_rate": 3.752030584544089e-06, "loss": 0.86934257, "num_input_tokens_seen": 57707235, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.75, "step": 2666, "time_per_iteration": 3.7724173069000244 }, { "auxiliary_loss_clip": 0.01099267, "auxiliary_loss_mlp": 0.01039921, "balance_loss_clip": 1.02032328, "balance_loss_mlp": 1.02699661, "epoch": 0.16034871486547422, "flos": 20989433606400.0, "grad_norm": 2.2293109702379645, "language_loss": 0.81689608, "learning_rate": 3.7518483358732142e-06, "loss": 0.83828795, "num_input_tokens_seen": 57724190, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.72265625, "step": 2667, "time_per_iteration": 2.3819265365600586 }, { "auxiliary_loss_clip": 0.01100691, "auxiliary_loss_mlp": 0.01045485, "balance_loss_clip": 1.02395546, "balance_loss_mlp": 1.02797079, "epoch": 0.1604088381181422, "flos": 21396660238080.0, "grad_norm": 2.223020140601549, "language_loss": 0.74172294, "learning_rate": 3.751666024683379e-06, "loss": 0.76318473, "num_input_tokens_seen": 57743620, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7265625, "step": 2668, "time_per_iteration": 2.4038949012756348 }, { "auxiliary_loss_clip": 0.01101106, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.02031231, "balance_loss_mlp": 1.02621579, "epoch": 0.16046896137081015, "flos": 23875947550080.0, "grad_norm": 1.5919316620720776, "language_loss": 0.77043045, "learning_rate": 3.751483650981089e-06, "loss": 0.79185653, "num_input_tokens_seen": 57764810, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.75, "step": 2669, "time_per_iteration": 2.435511350631714 }, { "auxiliary_loss_clip": 0.01026355, "auxiliary_loss_mlp": 0.01005916, "balance_loss_clip": 1.00378191, "balance_loss_mlp": 1.00718212, "epoch": 0.16052908462347812, "flos": 59803842608640.0, "grad_norm": 0.8001425358573404, "language_loss": 0.55502141, "learning_rate": 3.7513012147728527e-06, "loss": 0.57534409, "num_input_tokens_seen": 57824390, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.19140625, "step": 2670, "time_per_iteration": 2.9251914024353027 }, { "auxiliary_loss_clip": 0.011007, "auxiliary_loss_mlp": 0.01038048, "balance_loss_clip": 1.01755571, "balance_loss_mlp": 1.02633071, "epoch": 0.1605892078761461, "flos": 18295569878400.0, "grad_norm": 1.9109288235358965, "language_loss": 0.77216643, "learning_rate": 3.751118716065181e-06, "loss": 0.79355395, "num_input_tokens_seen": 57843665, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.7421875, "step": 2671, "time_per_iteration": 2.3948540687561035 }, { "auxiliary_loss_clip": 0.01101305, "auxiliary_loss_mlp": 0.01034247, "balance_loss_clip": 1.01454139, "balance_loss_mlp": 1.02747202, "epoch": 0.16064933112881408, "flos": 32159370637440.0, "grad_norm": 2.0845246797628487, "language_loss": 0.65131581, "learning_rate": 3.750936154864587e-06, "loss": 0.67267138, "num_input_tokens_seen": 57863305, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.73828125, "step": 2672, "time_per_iteration": 2.483060598373413 }, { "auxiliary_loss_clip": 0.01101782, "auxiliary_loss_mlp": 0.01037121, "balance_loss_clip": 1.0153178, "balance_loss_mlp": 1.02615297, "epoch": 0.16070945438148204, "flos": 19827768205440.0, "grad_norm": 2.084943627089922, "language_loss": 0.85613823, "learning_rate": 3.750753531177586e-06, "loss": 0.8775273, "num_input_tokens_seen": 57883025, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.7578125, "step": 2673, "time_per_iteration": 2.3847758769989014 }, { "auxiliary_loss_clip": 0.01102021, "auxiliary_loss_mlp": 0.01043887, "balance_loss_clip": 1.02426529, "balance_loss_mlp": 1.02886093, "epoch": 0.16076957763415, "flos": 18912240956160.0, "grad_norm": 2.5506654545037857, "language_loss": 0.73004067, "learning_rate": 3.750570845010694e-06, "loss": 0.75149977, "num_input_tokens_seen": 57901430, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.73046875, "step": 2674, "time_per_iteration": 2.352660894393921 }, { "auxiliary_loss_clip": 0.01099736, "auxiliary_loss_mlp": 0.01037093, "balance_loss_clip": 1.01462245, "balance_loss_mlp": 1.0261476, "epoch": 0.16082970088681797, "flos": 16763406462720.0, "grad_norm": 1.5455839925373789, "language_loss": 0.8386209, "learning_rate": 3.7503880963704314e-06, "loss": 0.85998923, "num_input_tokens_seen": 57919550, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.734375, "step": 2675, "time_per_iteration": 2.3616671562194824 }, { "auxiliary_loss_clip": 0.01103789, "auxiliary_loss_mlp": 0.01039682, "balance_loss_clip": 1.018749, "balance_loss_mlp": 1.02890682, "epoch": 0.16088982413948594, "flos": 35148878691840.0, "grad_norm": 1.8855339884645612, "language_loss": 0.82327354, "learning_rate": 3.7502052852633206e-06, "loss": 0.84470832, "num_input_tokens_seen": 57939890, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.75, "step": 2676, "time_per_iteration": 2.47837233543396 }, { "auxiliary_loss_clip": 0.01099421, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.01689005, "balance_loss_mlp": 1.02847695, "epoch": 0.1609499473921539, "flos": 18624102572160.0, "grad_norm": 2.4144862954961335, "language_loss": 0.73110569, "learning_rate": 3.7500224116958856e-06, "loss": 0.75245011, "num_input_tokens_seen": 57957410, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.7109375, "step": 2677, "time_per_iteration": 2.363727569580078 }, { "auxiliary_loss_clip": 0.01096931, "auxiliary_loss_mlp": 0.01035353, "balance_loss_clip": 1.01620793, "balance_loss_mlp": 1.02643561, "epoch": 0.1610100706448219, "flos": 33144340314240.0, "grad_norm": 1.700185238420181, "language_loss": 0.7650227, "learning_rate": 3.7498394756746522e-06, "loss": 0.78634554, "num_input_tokens_seen": 57977900, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.703125, "step": 2678, "time_per_iteration": 2.464813232421875 }, { "auxiliary_loss_clip": 0.01101983, "auxiliary_loss_mlp": 0.01036238, "balance_loss_clip": 1.01424408, "balance_loss_mlp": 1.0274899, "epoch": 0.16107019389748986, "flos": 34675316743680.0, "grad_norm": 1.8492951103689623, "language_loss": 0.70696336, "learning_rate": 3.749656477206149e-06, "loss": 0.72834557, "num_input_tokens_seen": 57998210, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.7421875, "step": 2679, "time_per_iteration": 2.497814893722534 }, { "auxiliary_loss_clip": 0.01025193, "auxiliary_loss_mlp": 0.01011856, "balance_loss_clip": 1.00947165, "balance_loss_mlp": 1.00566459, "epoch": 0.16113031715015783, "flos": 65710483735680.0, "grad_norm": 0.7914817310226664, "language_loss": 0.51820886, "learning_rate": 3.749473416296906e-06, "loss": 0.53857934, "num_input_tokens_seen": 58059420, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.1953125, "step": 2680, "time_per_iteration": 3.0602798461914062 }, { "auxiliary_loss_clip": 0.01100346, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.01643753, "balance_loss_mlp": 1.0259409, "epoch": 0.1611904404028258, "flos": 20809456214400.0, "grad_norm": 1.8493612624825924, "language_loss": 0.80466175, "learning_rate": 3.749290292953458e-06, "loss": 0.82606018, "num_input_tokens_seen": 58078370, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.7421875, "step": 2681, "time_per_iteration": 2.3731982707977295 }, { "auxiliary_loss_clip": 0.01098794, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.02272022, "balance_loss_mlp": 1.02759242, "epoch": 0.16125056365549376, "flos": 27012195515520.0, "grad_norm": 1.9213457875697393, "language_loss": 0.68854344, "learning_rate": 3.749107107182339e-06, "loss": 0.70996189, "num_input_tokens_seen": 58097395, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7109375, "step": 2682, "time_per_iteration": 2.4167189598083496 }, { "auxiliary_loss_clip": 0.01102691, "auxiliary_loss_mlp": 0.010446, "balance_loss_clip": 1.02196264, "balance_loss_mlp": 1.02975488, "epoch": 0.16131068690816172, "flos": 19275651964800.0, "grad_norm": 2.0126592915452126, "language_loss": 0.87158656, "learning_rate": 3.7489238589900855e-06, "loss": 0.89305949, "num_input_tokens_seen": 58115630, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.73046875, "step": 2683, "time_per_iteration": 2.3900914192199707 }, { "auxiliary_loss_clip": 0.01103146, "auxiliary_loss_mlp": 0.0104813, "balance_loss_clip": 1.02664828, "balance_loss_mlp": 1.02847111, "epoch": 0.16137081016082971, "flos": 35336396937600.0, "grad_norm": 1.9732873598850735, "language_loss": 0.74236965, "learning_rate": 3.7487405483832395e-06, "loss": 0.7638824, "num_input_tokens_seen": 58138655, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.74609375, "step": 2684, "time_per_iteration": 2.5172033309936523 }, { "auxiliary_loss_clip": 0.01107227, "auxiliary_loss_mlp": 0.01043533, "balance_loss_clip": 1.02189636, "balance_loss_mlp": 1.03007555, "epoch": 0.16143093341349768, "flos": 34233979847040.0, "grad_norm": 3.002548692009738, "language_loss": 0.70575935, "learning_rate": 3.748557175368341e-06, "loss": 0.72726703, "num_input_tokens_seen": 58157440, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.7734375, "step": 2685, "time_per_iteration": 2.5088298320770264 }, { "auxiliary_loss_clip": 0.01097151, "auxiliary_loss_mlp": 0.01038804, "balance_loss_clip": 1.01793098, "balance_loss_mlp": 1.02681971, "epoch": 0.16149105666616564, "flos": 27998072887680.0, "grad_norm": 1.8848704834199657, "language_loss": 0.716102, "learning_rate": 3.748373739951935e-06, "loss": 0.73746157, "num_input_tokens_seen": 58176660, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.703125, "step": 2686, "time_per_iteration": 2.4251043796539307 }, { "auxiliary_loss_clip": 0.01103306, "auxiliary_loss_mlp": 0.01045832, "balance_loss_clip": 1.0247798, "balance_loss_mlp": 1.03054428, "epoch": 0.1615511799188336, "flos": 19421344535040.0, "grad_norm": 2.039853388909567, "language_loss": 0.81668341, "learning_rate": 3.7481902421405676e-06, "loss": 0.83817482, "num_input_tokens_seen": 58195085, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.7265625, "step": 2687, "time_per_iteration": 2.3885385990142822 }, { "auxiliary_loss_clip": 0.01106753, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.01904213, "balance_loss_mlp": 1.02707458, "epoch": 0.16161130317150157, "flos": 22853865231360.0, "grad_norm": 1.776605083139513, "language_loss": 0.71621692, "learning_rate": 3.7480066819407876e-06, "loss": 0.7377212, "num_input_tokens_seen": 58213540, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 0.796875, "step": 2688, "time_per_iteration": 2.383474349975586 }, { "auxiliary_loss_clip": 0.01101855, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.02196348, "balance_loss_mlp": 1.02826595, "epoch": 0.16167142642416954, "flos": 26109201444480.0, "grad_norm": 3.3158671752621918, "language_loss": 0.75798613, "learning_rate": 3.7478230593591448e-06, "loss": 0.77942467, "num_input_tokens_seen": 58236995, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.734375, "step": 2689, "time_per_iteration": 2.491985321044922 }, { "auxiliary_loss_clip": 0.01101707, "auxiliary_loss_mlp": 0.01040813, "balance_loss_clip": 1.01946294, "balance_loss_mlp": 1.02907157, "epoch": 0.1617315496768375, "flos": 22778662429440.0, "grad_norm": 1.83149192820103, "language_loss": 0.87536496, "learning_rate": 3.747639374402193e-06, "loss": 0.89679015, "num_input_tokens_seen": 58257230, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.7265625, "step": 2690, "time_per_iteration": 2.417999029159546 }, { "auxiliary_loss_clip": 0.01099128, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.02136016, "balance_loss_mlp": 1.02734601, "epoch": 0.1617916729295055, "flos": 22016228958720.0, "grad_norm": 1.8029494924009606, "language_loss": 0.88038915, "learning_rate": 3.7474556270764877e-06, "loss": 0.90178472, "num_input_tokens_seen": 58277080, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.71875, "step": 2691, "time_per_iteration": 2.3933136463165283 }, { "auxiliary_loss_clip": 0.01106955, "auxiliary_loss_mlp": 0.0105175, "balance_loss_clip": 1.02844501, "balance_loss_mlp": 1.02773464, "epoch": 0.16185179618217346, "flos": 23437194094080.0, "grad_norm": 2.117898701803228, "language_loss": 0.82161796, "learning_rate": 3.7472718173885864e-06, "loss": 0.84320498, "num_input_tokens_seen": 58294815, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.79296875, "step": 2692, "time_per_iteration": 2.3905084133148193 }, { "auxiliary_loss_clip": 0.01105115, "auxiliary_loss_mlp": 0.01043661, "balance_loss_clip": 1.02046311, "balance_loss_mlp": 1.02833152, "epoch": 0.16191191943484143, "flos": 25664931993600.0, "grad_norm": 2.1941735514482166, "language_loss": 0.81331909, "learning_rate": 3.747087945345048e-06, "loss": 0.83480686, "num_input_tokens_seen": 58313215, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.765625, "step": 2693, "time_per_iteration": 2.413591146469116 }, { "auxiliary_loss_clip": 0.01100085, "auxiliary_loss_mlp": 0.01043148, "balance_loss_clip": 1.02325225, "balance_loss_mlp": 1.02796292, "epoch": 0.1619720426875094, "flos": 23476226860800.0, "grad_norm": 1.4848001684834402, "language_loss": 0.83649707, "learning_rate": 3.746904010952435e-06, "loss": 0.85792935, "num_input_tokens_seen": 58333215, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.71875, "step": 2694, "time_per_iteration": 2.411134958267212 }, { "auxiliary_loss_clip": 0.01107016, "auxiliary_loss_mlp": 0.01046708, "balance_loss_clip": 1.02434468, "balance_loss_mlp": 1.02975321, "epoch": 0.16203216594017736, "flos": 24132524198400.0, "grad_norm": 1.9641912622310724, "language_loss": 0.69131589, "learning_rate": 3.7467200142173114e-06, "loss": 0.71285313, "num_input_tokens_seen": 58351160, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 0.7734375, "step": 2695, "time_per_iteration": 2.4020485877990723 }, { "auxiliary_loss_clip": 0.01106137, "auxiliary_loss_mlp": 0.0104104, "balance_loss_clip": 1.01900971, "balance_loss_mlp": 1.02939367, "epoch": 0.16209228919284532, "flos": 22339943884800.0, "grad_norm": 2.057423905456492, "language_loss": 0.82545096, "learning_rate": 3.7465359551462438e-06, "loss": 0.84692276, "num_input_tokens_seen": 58368505, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.765625, "step": 2696, "time_per_iteration": 2.3817596435546875 }, { "auxiliary_loss_clip": 0.01108746, "auxiliary_loss_mlp": 0.01043349, "balance_loss_clip": 1.02006698, "balance_loss_mlp": 1.0291909, "epoch": 0.1621524124455133, "flos": 15814222796160.0, "grad_norm": 2.223793290993585, "language_loss": 0.88445479, "learning_rate": 3.7463518337458006e-06, "loss": 0.90597576, "num_input_tokens_seen": 58385085, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 0.796875, "step": 2697, "time_per_iteration": 2.373197317123413 }, { "auxiliary_loss_clip": 0.01097182, "auxiliary_loss_mlp": 0.01035197, "balance_loss_clip": 1.01657605, "balance_loss_mlp": 1.02738893, "epoch": 0.16221253569818128, "flos": 30185486300160.0, "grad_norm": 1.4813412941126236, "language_loss": 0.80739617, "learning_rate": 3.7461676500225522e-06, "loss": 0.82871991, "num_input_tokens_seen": 58406985, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.69921875, "step": 2698, "time_per_iteration": 2.4726316928863525 }, { "auxiliary_loss_clip": 0.01098231, "auxiliary_loss_mlp": 0.010466, "balance_loss_clip": 1.02551174, "balance_loss_mlp": 1.02775669, "epoch": 0.16227265895084925, "flos": 24604899160320.0, "grad_norm": 1.7044726119749638, "language_loss": 0.77323377, "learning_rate": 3.7459834039830726e-06, "loss": 0.79468215, "num_input_tokens_seen": 58426205, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.703125, "step": 2699, "time_per_iteration": 2.424938678741455 }, { "auxiliary_loss_clip": 0.01099858, "auxiliary_loss_mlp": 0.01036946, "balance_loss_clip": 1.01789641, "balance_loss_mlp": 1.02728641, "epoch": 0.1623327822035172, "flos": 19572308720640.0, "grad_norm": 2.732572726447219, "language_loss": 0.85681903, "learning_rate": 3.745799095633936e-06, "loss": 0.87818706, "num_input_tokens_seen": 58443830, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.7265625, "step": 2700, "time_per_iteration": 3.7618701457977295 }, { "auxiliary_loss_clip": 0.01099364, "auxiliary_loss_mlp": 0.01043829, "balance_loss_clip": 1.02216864, "balance_loss_mlp": 1.0269953, "epoch": 0.16239290545618518, "flos": 26467271015040.0, "grad_norm": 3.9493981243958216, "language_loss": 0.8032552, "learning_rate": 3.7456147249817203e-06, "loss": 0.82468712, "num_input_tokens_seen": 58464405, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.72265625, "step": 2701, "time_per_iteration": 2.429082155227661 }, { "auxiliary_loss_clip": 0.01101654, "auxiliary_loss_mlp": 0.01040281, "balance_loss_clip": 1.01992059, "balance_loss_mlp": 1.02973723, "epoch": 0.16245302870885314, "flos": 15851021235840.0, "grad_norm": 1.9193976785875857, "language_loss": 0.73022813, "learning_rate": 3.745430292033006e-06, "loss": 0.75164747, "num_input_tokens_seen": 58483295, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.71875, "step": 2702, "time_per_iteration": 3.7418477535247803 }, { "auxiliary_loss_clip": 0.01101791, "auxiliary_loss_mlp": 0.01042486, "balance_loss_clip": 1.02016985, "balance_loss_mlp": 1.02802634, "epoch": 0.1625131519615211, "flos": 14755656240000.0, "grad_norm": 2.2755867486125743, "language_loss": 0.72900951, "learning_rate": 3.745245796794374e-06, "loss": 0.75045222, "num_input_tokens_seen": 58501205, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.73828125, "step": 2703, "time_per_iteration": 3.726701021194458 }, { "auxiliary_loss_clip": 0.01102086, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.01637101, "balance_loss_mlp": 1.02656317, "epoch": 0.1625732752141891, "flos": 28219247550720.0, "grad_norm": 2.2852940608047865, "language_loss": 0.70878398, "learning_rate": 3.7450612392724084e-06, "loss": 0.73018903, "num_input_tokens_seen": 58522315, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.75390625, "step": 2704, "time_per_iteration": 2.432807683944702 }, { "auxiliary_loss_clip": 0.01026434, "auxiliary_loss_mlp": 0.01003702, "balance_loss_clip": 1.00136578, "balance_loss_mlp": 1.00683141, "epoch": 0.16263339846685707, "flos": 67324727491200.0, "grad_norm": 0.7802057449767931, "language_loss": 0.53309071, "learning_rate": 3.7448766194736967e-06, "loss": 0.55339205, "num_input_tokens_seen": 58586695, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.19628906, "step": 2705, "time_per_iteration": 3.03957462310791 }, { "auxiliary_loss_clip": 0.01103926, "auxiliary_loss_mlp": 0.01042252, "balance_loss_clip": 1.02042484, "balance_loss_mlp": 1.02761436, "epoch": 0.16269352171952503, "flos": 14318299238400.0, "grad_norm": 2.8690243218573026, "language_loss": 0.75459617, "learning_rate": 3.7446919374048265e-06, "loss": 0.77605796, "num_input_tokens_seen": 58602435, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.765625, "step": 2706, "time_per_iteration": 3.706674814224243 }, { "auxiliary_loss_clip": 0.01101461, "auxiliary_loss_mlp": 0.01033724, "balance_loss_clip": 1.01388717, "balance_loss_mlp": 1.02723992, "epoch": 0.162753644972193, "flos": 28360087441920.0, "grad_norm": 1.8757145565592657, "language_loss": 0.72222096, "learning_rate": 3.7445071930723888e-06, "loss": 0.74357283, "num_input_tokens_seen": 58621275, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.7421875, "step": 2707, "time_per_iteration": 2.4413931369781494 }, { "auxiliary_loss_clip": 0.01103872, "auxiliary_loss_mlp": 0.01043134, "balance_loss_clip": 1.0217123, "balance_loss_mlp": 1.02834046, "epoch": 0.16281376822486096, "flos": 19936836892800.0, "grad_norm": 2.571560648137463, "language_loss": 0.83571339, "learning_rate": 3.7443223864829773e-06, "loss": 0.85718346, "num_input_tokens_seen": 58637550, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.75390625, "step": 2708, "time_per_iteration": 2.38787579536438 }, { "auxiliary_loss_clip": 0.01107758, "auxiliary_loss_mlp": 0.01043605, "balance_loss_clip": 1.01953697, "balance_loss_mlp": 1.02829206, "epoch": 0.16287389147752893, "flos": 21250653465600.0, "grad_norm": 2.0981569629576327, "language_loss": 0.86046529, "learning_rate": 3.7441375176431863e-06, "loss": 0.88197893, "num_input_tokens_seen": 58654135, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 0.796875, "step": 2709, "time_per_iteration": 2.376641273498535 }, { "auxiliary_loss_clip": 0.01101736, "auxiliary_loss_mlp": 0.01040952, "balance_loss_clip": 1.02080548, "balance_loss_mlp": 1.02731037, "epoch": 0.1629340147301969, "flos": 19243671292800.0, "grad_norm": 1.6395461203937707, "language_loss": 0.91247582, "learning_rate": 3.7439525865596137e-06, "loss": 0.93390268, "num_input_tokens_seen": 58674320, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.7421875, "step": 2710, "time_per_iteration": 2.399232864379883 }, { "auxiliary_loss_clip": 0.01102421, "auxiliary_loss_mlp": 0.01043115, "balance_loss_clip": 1.02130008, "balance_loss_mlp": 1.02973938, "epoch": 0.16299413798286488, "flos": 21248803163520.0, "grad_norm": 2.4521110439754237, "language_loss": 0.81027466, "learning_rate": 3.7437675932388596e-06, "loss": 0.83173001, "num_input_tokens_seen": 58691000, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.7265625, "step": 2711, "time_per_iteration": 2.367405891418457 }, { "auxiliary_loss_clip": 0.01104288, "auxiliary_loss_mlp": 0.01039016, "balance_loss_clip": 1.01658082, "balance_loss_mlp": 1.02587044, "epoch": 0.16305426123553285, "flos": 18769585674240.0, "grad_norm": 2.1562163814010247, "language_loss": 0.8089633, "learning_rate": 3.7435825376875253e-06, "loss": 0.83039629, "num_input_tokens_seen": 58710230, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.78125, "step": 2712, "time_per_iteration": 2.3526713848114014 }, { "auxiliary_loss_clip": 0.0110389, "auxiliary_loss_mlp": 0.01043683, "balance_loss_clip": 1.02184391, "balance_loss_mlp": 1.02715445, "epoch": 0.16311438448820081, "flos": 22086648904320.0, "grad_norm": 1.8909279713995406, "language_loss": 0.77136874, "learning_rate": 3.743397419912215e-06, "loss": 0.79284441, "num_input_tokens_seen": 58728610, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.765625, "step": 2713, "time_per_iteration": 2.3635666370391846 }, { "auxiliary_loss_clip": 0.01103318, "auxiliary_loss_mlp": 0.01045318, "balance_loss_clip": 1.02394414, "balance_loss_mlp": 1.03042424, "epoch": 0.16317450774086878, "flos": 16466889352320.0, "grad_norm": 2.6818998512786365, "language_loss": 0.7886489, "learning_rate": 3.7432122399195365e-06, "loss": 0.81013525, "num_input_tokens_seen": 58744385, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.7265625, "step": 2714, "time_per_iteration": 2.347163200378418 }, { "auxiliary_loss_clip": 0.0110423, "auxiliary_loss_mlp": 0.01040664, "balance_loss_clip": 1.02042222, "balance_loss_mlp": 1.02938485, "epoch": 0.16323463099353674, "flos": 24351778736640.0, "grad_norm": 1.650219947186336, "language_loss": 0.77981454, "learning_rate": 3.7430269977160956e-06, "loss": 0.80126345, "num_input_tokens_seen": 58763905, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.75, "step": 2715, "time_per_iteration": 2.402946949005127 }, { "auxiliary_loss_clip": 0.01099133, "auxiliary_loss_mlp": 0.0103536, "balance_loss_clip": 1.01517773, "balance_loss_mlp": 1.0264461, "epoch": 0.1632947542462047, "flos": 24899600880000.0, "grad_norm": 2.4567311825744897, "language_loss": 0.82195216, "learning_rate": 3.742841693308506e-06, "loss": 0.84329712, "num_input_tokens_seen": 58785580, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.7265625, "step": 2716, "time_per_iteration": 2.417941093444824 }, { "auxiliary_loss_clip": 0.01105238, "auxiliary_loss_mlp": 0.01042246, "balance_loss_clip": 1.02075291, "balance_loss_mlp": 1.0306592, "epoch": 0.1633548774988727, "flos": 24899112120960.0, "grad_norm": 1.936804784333361, "language_loss": 0.86132491, "learning_rate": 3.742656326703379e-06, "loss": 0.88279974, "num_input_tokens_seen": 58806075, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.74609375, "step": 2717, "time_per_iteration": 2.3981850147247314 }, { "auxiliary_loss_clip": 0.01100908, "auxiliary_loss_mlp": 0.01038622, "balance_loss_clip": 1.01877379, "balance_loss_mlp": 1.0287087, "epoch": 0.16341500075154067, "flos": 30440596671360.0, "grad_norm": 1.706598655723777, "language_loss": 0.76384556, "learning_rate": 3.7424708979073306e-06, "loss": 0.78524089, "num_input_tokens_seen": 58827405, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.72265625, "step": 2718, "time_per_iteration": 2.4446098804473877 }, { "auxiliary_loss_clip": 0.01102697, "auxiliary_loss_mlp": 0.01037988, "balance_loss_clip": 1.0174011, "balance_loss_mlp": 1.02754319, "epoch": 0.16347512400420863, "flos": 22783410374400.0, "grad_norm": 1.9844112790707533, "language_loss": 0.73798156, "learning_rate": 3.742285406926978e-06, "loss": 0.75938845, "num_input_tokens_seen": 58847205, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.75, "step": 2719, "time_per_iteration": 2.3830654621124268 }, { "auxiliary_loss_clip": 0.0110243, "auxiliary_loss_mlp": 0.0103931, "balance_loss_clip": 1.01919889, "balance_loss_mlp": 1.02733767, "epoch": 0.1635352472568766, "flos": 22632306543360.0, "grad_norm": 1.6711539850838706, "language_loss": 0.72027409, "learning_rate": 3.7420998537689402e-06, "loss": 0.74169153, "num_input_tokens_seen": 58866865, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.75, "step": 2720, "time_per_iteration": 2.3898520469665527 }, { "auxiliary_loss_clip": 0.01099884, "auxiliary_loss_mlp": 0.01038156, "balance_loss_clip": 1.01648426, "balance_loss_mlp": 1.02837658, "epoch": 0.16359537050954456, "flos": 15522104517120.0, "grad_norm": 2.0201840408562517, "language_loss": 0.75201935, "learning_rate": 3.7419142384398404e-06, "loss": 0.77339977, "num_input_tokens_seen": 58885200, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.71484375, "step": 2721, "time_per_iteration": 2.3708364963531494 }, { "auxiliary_loss_clip": 0.01102009, "auxiliary_loss_mlp": 0.01038147, "balance_loss_clip": 1.01720154, "balance_loss_mlp": 1.02607942, "epoch": 0.16365549376221253, "flos": 22089092699520.0, "grad_norm": 2.0125671119970114, "language_loss": 0.79488349, "learning_rate": 3.7417285609463026e-06, "loss": 0.81628501, "num_input_tokens_seen": 58906385, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7578125, "step": 2722, "time_per_iteration": 2.408007860183716 }, { "auxiliary_loss_clip": 0.01104363, "auxiliary_loss_mlp": 0.01044011, "balance_loss_clip": 1.02027655, "balance_loss_mlp": 1.02764964, "epoch": 0.1637156170148805, "flos": 24059276432640.0, "grad_norm": 3.464672780683019, "language_loss": 0.84411418, "learning_rate": 3.7415428212949524e-06, "loss": 0.86559796, "num_input_tokens_seen": 58925040, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 0.765625, "step": 2723, "time_per_iteration": 2.450230598449707 }, { "auxiliary_loss_clip": 0.01097816, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.0164578, "balance_loss_mlp": 1.02698934, "epoch": 0.1637757402675485, "flos": 26684221403520.0, "grad_norm": 6.8876305134667035, "language_loss": 0.71284223, "learning_rate": 3.7413570194924183e-06, "loss": 0.73419076, "num_input_tokens_seen": 58944790, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.7109375, "step": 2724, "time_per_iteration": 2.427006721496582 }, { "auxiliary_loss_clip": 0.01097508, "auxiliary_loss_mlp": 0.01036383, "balance_loss_clip": 1.01665354, "balance_loss_mlp": 1.02673888, "epoch": 0.16383586352021645, "flos": 16106026872960.0, "grad_norm": 2.22160942867275, "language_loss": 0.70896482, "learning_rate": 3.741171155545332e-06, "loss": 0.73030376, "num_input_tokens_seen": 58962500, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.70703125, "step": 2725, "time_per_iteration": 2.3558108806610107 }, { "auxiliary_loss_clip": 0.01097869, "auxiliary_loss_mlp": 0.01036382, "balance_loss_clip": 1.01664102, "balance_loss_mlp": 1.02822781, "epoch": 0.16389598677288442, "flos": 19165151911680.0, "grad_norm": 2.929731921097319, "language_loss": 0.88497961, "learning_rate": 3.7409852294603255e-06, "loss": 0.90632212, "num_input_tokens_seen": 58980355, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6953125, "step": 2726, "time_per_iteration": 2.37483549118042 }, { "auxiliary_loss_clip": 0.01105815, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.02011204, "balance_loss_mlp": 1.03108537, "epoch": 0.16395611002555238, "flos": 21505938393600.0, "grad_norm": 1.9957275556100098, "language_loss": 0.74080288, "learning_rate": 3.740799241244035e-06, "loss": 0.76227391, "num_input_tokens_seen": 58999505, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.75, "step": 2727, "time_per_iteration": 2.384979009628296 }, { "auxiliary_loss_clip": 0.01097125, "auxiliary_loss_mlp": 0.01038954, "balance_loss_clip": 1.01969004, "balance_loss_mlp": 1.02770567, "epoch": 0.16401623327822035, "flos": 21469838181120.0, "grad_norm": 1.7100790204559277, "language_loss": 0.82165432, "learning_rate": 3.7406131909030972e-06, "loss": 0.84301507, "num_input_tokens_seen": 59017930, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6953125, "step": 2728, "time_per_iteration": 2.406728982925415 }, { "auxiliary_loss_clip": 0.01102966, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.02002883, "balance_loss_mlp": 1.02820706, "epoch": 0.1640763565308883, "flos": 13625378017920.0, "grad_norm": 7.722774197557603, "language_loss": 0.85067058, "learning_rate": 3.740427078444152e-06, "loss": 0.87210703, "num_input_tokens_seen": 59035130, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.74609375, "step": 2729, "time_per_iteration": 2.3586716651916504 }, { "auxiliary_loss_clip": 0.01100578, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.02415013, "balance_loss_mlp": 1.02738476, "epoch": 0.16413647978355628, "flos": 15450532496640.0, "grad_norm": 2.248498291358936, "language_loss": 0.72755969, "learning_rate": 3.7402409038738416e-06, "loss": 0.74900734, "num_input_tokens_seen": 59053080, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.73046875, "step": 2730, "time_per_iteration": 2.357088804244995 }, { "auxiliary_loss_clip": 0.01102125, "auxiliary_loss_mlp": 0.01043043, "balance_loss_clip": 1.02051187, "balance_loss_mlp": 1.0260247, "epoch": 0.16419660303622427, "flos": 45876955155840.0, "grad_norm": 1.7213639834879002, "language_loss": 0.74439585, "learning_rate": 3.7400546671988096e-06, "loss": 0.76584756, "num_input_tokens_seen": 59075610, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.76171875, "step": 2731, "time_per_iteration": 2.5883896350860596 }, { "auxiliary_loss_clip": 0.01105179, "auxiliary_loss_mlp": 0.0103957, "balance_loss_clip": 1.01845753, "balance_loss_mlp": 1.02862, "epoch": 0.16425672628889224, "flos": 18951832304640.0, "grad_norm": 2.8613939021894943, "language_loss": 0.79236877, "learning_rate": 3.739868368425702e-06, "loss": 0.81381625, "num_input_tokens_seen": 59094555, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.765625, "step": 2732, "time_per_iteration": 2.36692476272583 }, { "auxiliary_loss_clip": 0.0110444, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.01827526, "balance_loss_mlp": 1.02979863, "epoch": 0.1643168495415602, "flos": 24311943008640.0, "grad_norm": 2.5706923919132962, "language_loss": 0.69387078, "learning_rate": 3.7396820075611682e-06, "loss": 0.71530807, "num_input_tokens_seen": 59113515, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.74609375, "step": 2733, "time_per_iteration": 2.3920483589172363 }, { "auxiliary_loss_clip": 0.01103058, "auxiliary_loss_mlp": 0.01040968, "balance_loss_clip": 1.01924813, "balance_loss_mlp": 1.02886534, "epoch": 0.16437697279422817, "flos": 26427330552960.0, "grad_norm": 2.0778114265675827, "language_loss": 0.81116164, "learning_rate": 3.7394955846118585e-06, "loss": 0.83260185, "num_input_tokens_seen": 59133275, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 0.7421875, "step": 2734, "time_per_iteration": 2.4219093322753906 }, { "auxiliary_loss_clip": 0.01100096, "auxiliary_loss_mlp": 0.01037873, "balance_loss_clip": 1.01756001, "balance_loss_mlp": 1.02732301, "epoch": 0.16443709604689613, "flos": 34530811159680.0, "grad_norm": 2.1915788350221095, "language_loss": 0.82217395, "learning_rate": 3.739309099584426e-06, "loss": 0.84355366, "num_input_tokens_seen": 59154095, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7265625, "step": 2735, "time_per_iteration": 2.478828191757202 }, { "auxiliary_loss_clip": 0.01099139, "auxiliary_loss_mlp": 0.01036727, "balance_loss_clip": 1.01740348, "balance_loss_mlp": 1.0274241, "epoch": 0.1644972192995641, "flos": 23256937411200.0, "grad_norm": 3.0648714549534146, "language_loss": 0.78555602, "learning_rate": 3.7391225524855256e-06, "loss": 0.80691475, "num_input_tokens_seen": 59173795, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.71875, "step": 2736, "time_per_iteration": 2.388718605041504 }, { "auxiliary_loss_clip": 0.01103637, "auxiliary_loss_mlp": 0.01040628, "balance_loss_clip": 1.0214355, "balance_loss_mlp": 1.03016293, "epoch": 0.1645573425522321, "flos": 26978329630080.0, "grad_norm": 1.7862917661888835, "language_loss": 0.81538427, "learning_rate": 3.738935943321815e-06, "loss": 0.83682692, "num_input_tokens_seen": 59191610, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.734375, "step": 2737, "time_per_iteration": 2.411057472229004 }, { "auxiliary_loss_clip": 0.0110028, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.01886678, "balance_loss_mlp": 1.02628779, "epoch": 0.16461746580490005, "flos": 28730480722560.0, "grad_norm": 1.9910116654991181, "language_loss": 0.87328762, "learning_rate": 3.7387492720999536e-06, "loss": 0.89467835, "num_input_tokens_seen": 59213000, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.7421875, "step": 2738, "time_per_iteration": 2.4444847106933594 }, { "auxiliary_loss_clip": 0.01099441, "auxiliary_loss_mlp": 0.01044621, "balance_loss_clip": 1.02408099, "balance_loss_mlp": 1.02695906, "epoch": 0.16467758905756802, "flos": 24929172668160.0, "grad_norm": 1.6850865175004341, "language_loss": 0.71940517, "learning_rate": 3.7385625388266037e-06, "loss": 0.7408458, "num_input_tokens_seen": 59232340, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.7265625, "step": 2739, "time_per_iteration": 2.3921971321105957 }, { "auxiliary_loss_clip": 0.01098789, "auxiliary_loss_mlp": 0.01035218, "balance_loss_clip": 1.01532221, "balance_loss_mlp": 1.02643645, "epoch": 0.16473771231023598, "flos": 24825375596160.0, "grad_norm": 3.8569228628265426, "language_loss": 0.81790274, "learning_rate": 3.7383757435084284e-06, "loss": 0.83924282, "num_input_tokens_seen": 59253950, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.72265625, "step": 2740, "time_per_iteration": 3.817305564880371 }, { "auxiliary_loss_clip": 0.01106972, "auxiliary_loss_mlp": 0.01048364, "balance_loss_clip": 1.02619159, "balance_loss_mlp": 1.0297575, "epoch": 0.16479783556290395, "flos": 39894482822400.0, "grad_norm": 2.491280067494279, "language_loss": 0.68863475, "learning_rate": 3.7381888861520943e-06, "loss": 0.71018815, "num_input_tokens_seen": 59275545, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.7734375, "step": 2741, "time_per_iteration": 2.5311717987060547 }, { "auxiliary_loss_clip": 0.01100268, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.01497364, "balance_loss_mlp": 1.02653241, "epoch": 0.16485795881557191, "flos": 19896163292160.0, "grad_norm": 1.7078130198013188, "language_loss": 0.79608095, "learning_rate": 3.73800196676427e-06, "loss": 0.8174367, "num_input_tokens_seen": 59293480, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.73828125, "step": 2742, "time_per_iteration": 3.780141592025757 }, { "auxiliary_loss_clip": 0.01099181, "auxiliary_loss_mlp": 0.0104227, "balance_loss_clip": 1.02131319, "balance_loss_mlp": 1.02702117, "epoch": 0.16491808206823988, "flos": 20555148804480.0, "grad_norm": 2.675536099106907, "language_loss": 0.8468293, "learning_rate": 3.737814985351627e-06, "loss": 0.86824381, "num_input_tokens_seen": 59313435, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.72265625, "step": 2743, "time_per_iteration": 3.7769858837127686 }, { "auxiliary_loss_clip": 0.0109797, "auxiliary_loss_mlp": 0.01038289, "balance_loss_clip": 1.01821387, "balance_loss_mlp": 1.02596939, "epoch": 0.16497820532090787, "flos": 23799802141440.0, "grad_norm": 1.614851134462598, "language_loss": 0.85501188, "learning_rate": 3.7376279419208367e-06, "loss": 0.87637448, "num_input_tokens_seen": 59331535, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.71875, "step": 2744, "time_per_iteration": 2.3961873054504395 }, { "auxiliary_loss_clip": 0.0109583, "auxiliary_loss_mlp": 0.0104075, "balance_loss_clip": 1.02158082, "balance_loss_mlp": 1.02610826, "epoch": 0.16503832857357584, "flos": 25481498376960.0, "grad_norm": 2.0251563213959, "language_loss": 0.82605666, "learning_rate": 3.7374408364785744e-06, "loss": 0.84742248, "num_input_tokens_seen": 59350680, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.69921875, "step": 2745, "time_per_iteration": 3.801182508468628 }, { "auxiliary_loss_clip": 0.01105267, "auxiliary_loss_mlp": 0.01046314, "balance_loss_clip": 1.02546382, "balance_loss_mlp": 1.02833867, "epoch": 0.1650984518262438, "flos": 17675093462400.0, "grad_norm": 2.203588025169381, "language_loss": 0.76188481, "learning_rate": 3.7372536690315187e-06, "loss": 0.7834006, "num_input_tokens_seen": 59367020, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.76953125, "step": 2746, "time_per_iteration": 2.367724895477295 }, { "auxiliary_loss_clip": 0.0109833, "auxiliary_loss_mlp": 0.01038929, "balance_loss_clip": 1.01905644, "balance_loss_mlp": 1.02701068, "epoch": 0.16515857507891177, "flos": 18697315426560.0, "grad_norm": 1.5013443128957897, "language_loss": 0.80648381, "learning_rate": 3.737066439586348e-06, "loss": 0.82785642, "num_input_tokens_seen": 59386075, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.7109375, "step": 2747, "time_per_iteration": 2.379795789718628 }, { "auxiliary_loss_clip": 0.01103344, "auxiliary_loss_mlp": 0.01039918, "balance_loss_clip": 1.01911581, "balance_loss_mlp": 1.02899504, "epoch": 0.16521869833157973, "flos": 15009649447680.0, "grad_norm": 2.0025581701827586, "language_loss": 0.69230592, "learning_rate": 3.7368791481497448e-06, "loss": 0.71373856, "num_input_tokens_seen": 59402690, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.7421875, "step": 2748, "time_per_iteration": 2.341280937194824 }, { "auxiliary_loss_clip": 0.01101063, "auxiliary_loss_mlp": 0.01046647, "balance_loss_clip": 1.02620232, "balance_loss_mlp": 1.02744234, "epoch": 0.1652788215842477, "flos": 22120235498880.0, "grad_norm": 2.13358667618576, "language_loss": 0.87971032, "learning_rate": 3.736691794728392e-06, "loss": 0.90118742, "num_input_tokens_seen": 59421130, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.734375, "step": 2749, "time_per_iteration": 2.374248504638672 }, { "auxiliary_loss_clip": 0.01100285, "auxiliary_loss_mlp": 0.01034894, "balance_loss_clip": 1.01435399, "balance_loss_mlp": 1.0265491, "epoch": 0.16533894483691566, "flos": 18332089027200.0, "grad_norm": 1.9698727226091124, "language_loss": 0.79004288, "learning_rate": 3.736504379328976e-06, "loss": 0.81139457, "num_input_tokens_seen": 59438970, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.73828125, "step": 2750, "time_per_iteration": 2.3773529529571533 }, { "auxiliary_loss_clip": 0.01100886, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.019876, "balance_loss_mlp": 1.02821505, "epoch": 0.16539906808958366, "flos": 22381036421760.0, "grad_norm": 1.679330557616043, "language_loss": 0.95238423, "learning_rate": 3.7363169019581865e-06, "loss": 0.97379339, "num_input_tokens_seen": 59458510, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.7265625, "step": 2751, "time_per_iteration": 2.3926827907562256 }, { "auxiliary_loss_clip": 0.01098458, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.01447558, "balance_loss_mlp": 1.02918196, "epoch": 0.16545919134225162, "flos": 22709988051840.0, "grad_norm": 3.6291009142097597, "language_loss": 0.70971817, "learning_rate": 3.7361293626227125e-06, "loss": 0.73104578, "num_input_tokens_seen": 59477110, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.69140625, "step": 2752, "time_per_iteration": 2.4085581302642822 }, { "auxiliary_loss_clip": 0.01029316, "auxiliary_loss_mlp": 0.01008623, "balance_loss_clip": 1.00564301, "balance_loss_mlp": 1.00707996, "epoch": 0.1655193145949196, "flos": 67799720805120.0, "grad_norm": 0.8051340800737071, "language_loss": 0.54032564, "learning_rate": 3.735941761329248e-06, "loss": 0.56070507, "num_input_tokens_seen": 59541155, "router_z_loss_clip": 0.02978516, "router_z_loss_mlp": 0.22265625, "step": 2753, "time_per_iteration": 3.1193695068359375 }, { "auxiliary_loss_clip": 0.01099621, "auxiliary_loss_mlp": 0.01033631, "balance_loss_clip": 1.01318693, "balance_loss_mlp": 1.02745223, "epoch": 0.16557943784758755, "flos": 24279229198080.0, "grad_norm": 1.8369958358468492, "language_loss": 0.75099742, "learning_rate": 3.735754098084487e-06, "loss": 0.77232993, "num_input_tokens_seen": 59561155, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.71875, "step": 2754, "time_per_iteration": 2.4263463020324707 }, { "auxiliary_loss_clip": 0.01108749, "auxiliary_loss_mlp": 0.01046593, "balance_loss_clip": 1.02282274, "balance_loss_mlp": 1.03078508, "epoch": 0.16563956110025552, "flos": 20082599285760.0, "grad_norm": 2.722434912623219, "language_loss": 0.86311758, "learning_rate": 3.7355663728951265e-06, "loss": 0.88467097, "num_input_tokens_seen": 59580460, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 0.77734375, "step": 2755, "time_per_iteration": 2.3921918869018555 }, { "auxiliary_loss_clip": 0.01098502, "auxiliary_loss_mlp": 0.01042226, "balance_loss_clip": 1.02297413, "balance_loss_mlp": 1.02710485, "epoch": 0.16569968435292348, "flos": 28033300316160.0, "grad_norm": 1.9668023918212456, "language_loss": 0.73244894, "learning_rate": 3.7353785857678675e-06, "loss": 0.75385618, "num_input_tokens_seen": 59600025, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.71484375, "step": 2756, "time_per_iteration": 2.442095994949341 }, { "auxiliary_loss_clip": 0.01097588, "auxiliary_loss_mlp": 0.01038453, "balance_loss_clip": 1.01904607, "balance_loss_mlp": 1.02879632, "epoch": 0.16575980760559147, "flos": 26249028906240.0, "grad_norm": 1.7749553241589369, "language_loss": 0.74760187, "learning_rate": 3.7351907367094105e-06, "loss": 0.76896232, "num_input_tokens_seen": 59620600, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6875, "step": 2757, "time_per_iteration": 2.4306015968322754 }, { "auxiliary_loss_clip": 0.01101508, "auxiliary_loss_mlp": 0.01037136, "balance_loss_clip": 1.01654887, "balance_loss_mlp": 1.02925587, "epoch": 0.16581993085825944, "flos": 26942718176640.0, "grad_norm": 2.1596303075322982, "language_loss": 0.84663153, "learning_rate": 3.7350028257264593e-06, "loss": 0.86801791, "num_input_tokens_seen": 59641385, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.72265625, "step": 2758, "time_per_iteration": 2.438163995742798 }, { "auxiliary_loss_clip": 0.01104018, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.0203855, "balance_loss_mlp": 1.03096461, "epoch": 0.1658800541109274, "flos": 21652538659200.0, "grad_norm": 1.886810125326837, "language_loss": 0.79101157, "learning_rate": 3.7348148528257202e-06, "loss": 0.81244564, "num_input_tokens_seen": 59659865, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.73046875, "step": 2759, "time_per_iteration": 2.3999645709991455 }, { "auxiliary_loss_clip": 0.0109939, "auxiliary_loss_mlp": 0.01037362, "balance_loss_clip": 1.01654792, "balance_loss_mlp": 1.02732992, "epoch": 0.16594017736359537, "flos": 16434559566720.0, "grad_norm": 2.2047309594012026, "language_loss": 0.75204885, "learning_rate": 3.734626818013902e-06, "loss": 0.77341634, "num_input_tokens_seen": 59678780, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.72265625, "step": 2760, "time_per_iteration": 2.3776280879974365 }, { "auxiliary_loss_clip": 0.01104055, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.01839638, "balance_loss_mlp": 1.0285697, "epoch": 0.16600030061626334, "flos": 22636216615680.0, "grad_norm": 2.606869656949303, "language_loss": 0.73423386, "learning_rate": 3.734438721297714e-06, "loss": 0.75566459, "num_input_tokens_seen": 59698795, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.75390625, "step": 2761, "time_per_iteration": 2.4225025177001953 }, { "auxiliary_loss_clip": 0.01099037, "auxiliary_loss_mlp": 0.01040729, "balance_loss_clip": 1.02139318, "balance_loss_mlp": 1.02757204, "epoch": 0.1660604238689313, "flos": 26395349880960.0, "grad_norm": 3.788373493784275, "language_loss": 0.8883667, "learning_rate": 3.73425056268387e-06, "loss": 0.90976429, "num_input_tokens_seen": 59718795, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.71484375, "step": 2762, "time_per_iteration": 2.479235887527466 }, { "auxiliary_loss_clip": 0.01101874, "auxiliary_loss_mlp": 0.01041349, "balance_loss_clip": 1.02078533, "balance_loss_mlp": 1.02852178, "epoch": 0.16612054712159927, "flos": 23038869859200.0, "grad_norm": 2.36889733150707, "language_loss": 0.8771072, "learning_rate": 3.7340623421790843e-06, "loss": 0.89853942, "num_input_tokens_seen": 59737555, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.734375, "step": 2763, "time_per_iteration": 2.483316421508789 }, { "auxiliary_loss_clip": 0.0102727, "auxiliary_loss_mlp": 0.01003247, "balance_loss_clip": 1.00039816, "balance_loss_mlp": 1.0044899, "epoch": 0.16618067037426726, "flos": 59237864691840.0, "grad_norm": 0.7714551426644473, "language_loss": 0.59774059, "learning_rate": 3.733874059790074e-06, "loss": 0.61804575, "num_input_tokens_seen": 59800915, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.22753906, "step": 2764, "time_per_iteration": 3.048445224761963 }, { "auxiliary_loss_clip": 0.01105783, "auxiliary_loss_mlp": 0.01040468, "balance_loss_clip": 1.01886737, "balance_loss_mlp": 1.0312283, "epoch": 0.16624079362693522, "flos": 27197584168320.0, "grad_norm": 1.7848183524532986, "language_loss": 0.82096636, "learning_rate": 3.733685715523559e-06, "loss": 0.84242886, "num_input_tokens_seen": 59822910, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.74609375, "step": 2765, "time_per_iteration": 2.4442572593688965 }, { "auxiliary_loss_clip": 0.01107709, "auxiliary_loss_mlp": 0.010432, "balance_loss_clip": 1.01971591, "balance_loss_mlp": 1.02863848, "epoch": 0.1663009168796032, "flos": 10924322549760.0, "grad_norm": 2.682801917681024, "language_loss": 0.69734764, "learning_rate": 3.7334973093862595e-06, "loss": 0.71885675, "num_input_tokens_seen": 59838805, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 0.7890625, "step": 2766, "time_per_iteration": 2.3364741802215576 }, { "auxiliary_loss_clip": 0.01100791, "auxiliary_loss_mlp": 0.01040526, "balance_loss_clip": 1.02058244, "balance_loss_mlp": 1.03000283, "epoch": 0.16636104013227115, "flos": 17893475216640.0, "grad_norm": 2.7257714940608744, "language_loss": 0.88355601, "learning_rate": 3.7333088413849008e-06, "loss": 0.9049691, "num_input_tokens_seen": 59855345, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.70703125, "step": 2767, "time_per_iteration": 2.365118980407715 }, { "auxiliary_loss_clip": 0.01026081, "auxiliary_loss_mlp": 0.01000866, "balance_loss_clip": 0.99827963, "balance_loss_mlp": 1.00439978, "epoch": 0.16642116338493912, "flos": 66722335159680.0, "grad_norm": 0.6389215842058035, "language_loss": 0.52877498, "learning_rate": 3.7331203115262078e-06, "loss": 0.54904449, "num_input_tokens_seen": 59917710, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.21679688, "step": 2768, "time_per_iteration": 3.1787824630737305 }, { "auxiliary_loss_clip": 0.01104612, "auxiliary_loss_mlp": 0.01035194, "balance_loss_clip": 1.01367664, "balance_loss_mlp": 1.02860165, "epoch": 0.16648128663760708, "flos": 19025045159040.0, "grad_norm": 2.534206350664639, "language_loss": 0.85295093, "learning_rate": 3.7329317198169098e-06, "loss": 0.87434894, "num_input_tokens_seen": 59935105, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7578125, "step": 2769, "time_per_iteration": 2.354337692260742 }, { "auxiliary_loss_clip": 0.01025435, "auxiliary_loss_mlp": 0.01002373, "balance_loss_clip": 0.99979842, "balance_loss_mlp": 1.00393605, "epoch": 0.16654140989027508, "flos": 70131744535680.0, "grad_norm": 0.806276761124177, "language_loss": 0.57446808, "learning_rate": 3.732743066263736e-06, "loss": 0.59474611, "num_input_tokens_seen": 59984085, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.21484375, "step": 2770, "time_per_iteration": 2.9205851554870605 }, { "auxiliary_loss_clip": 0.01024973, "auxiliary_loss_mlp": 0.01004411, "balance_loss_clip": 1.0017761, "balance_loss_mlp": 1.00355017, "epoch": 0.16660153314294304, "flos": 70269407493120.0, "grad_norm": 0.8895709965521077, "language_loss": 0.56245881, "learning_rate": 3.7325543508734187e-06, "loss": 0.58275265, "num_input_tokens_seen": 60043470, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.21484375, "step": 2771, "time_per_iteration": 2.8889243602752686 }, { "auxiliary_loss_clip": 0.01101051, "auxiliary_loss_mlp": 0.0104363, "balance_loss_clip": 1.02268481, "balance_loss_mlp": 1.02889836, "epoch": 0.166661656395611, "flos": 23073957642240.0, "grad_norm": 3.431203791623101, "language_loss": 0.70461863, "learning_rate": 3.732365573652694e-06, "loss": 0.7260654, "num_input_tokens_seen": 60063045, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.72265625, "step": 2772, "time_per_iteration": 2.4362192153930664 }, { "auxiliary_loss_clip": 0.01099417, "auxiliary_loss_mlp": 0.01039775, "balance_loss_clip": 1.01942599, "balance_loss_mlp": 1.02762997, "epoch": 0.16672177964827897, "flos": 28365079766400.0, "grad_norm": 3.772922793979306, "language_loss": 0.86091107, "learning_rate": 3.7321767346082977e-06, "loss": 0.882303, "num_input_tokens_seen": 60081945, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.71875, "step": 2773, "time_per_iteration": 2.4295010566711426 }, { "auxiliary_loss_clip": 0.01099592, "auxiliary_loss_mlp": 0.01030765, "balance_loss_clip": 1.01282382, "balance_loss_mlp": 1.02890277, "epoch": 0.16678190290094694, "flos": 19090228400640.0, "grad_norm": 2.2753054762817917, "language_loss": 0.82221007, "learning_rate": 3.7319878337469694e-06, "loss": 0.84351367, "num_input_tokens_seen": 60096820, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.70703125, "step": 2774, "time_per_iteration": 2.39690899848938 }, { "auxiliary_loss_clip": 0.01102119, "auxiliary_loss_mlp": 0.01040889, "balance_loss_clip": 1.02076638, "balance_loss_mlp": 1.02918148, "epoch": 0.1668420261536149, "flos": 21798021761280.0, "grad_norm": 2.396825643784463, "language_loss": 0.8285411, "learning_rate": 3.73179887107545e-06, "loss": 0.84997118, "num_input_tokens_seen": 60116140, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.7265625, "step": 2775, "time_per_iteration": 2.388744354248047 }, { "auxiliary_loss_clip": 0.01099687, "auxiliary_loss_mlp": 0.01039013, "balance_loss_clip": 1.02028513, "balance_loss_mlp": 1.02941787, "epoch": 0.16690214940628287, "flos": 19061529396480.0, "grad_norm": 3.126882021668333, "language_loss": 0.806099, "learning_rate": 3.731609846600485e-06, "loss": 0.82748598, "num_input_tokens_seen": 60134235, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.703125, "step": 2776, "time_per_iteration": 2.386744976043701 }, { "auxiliary_loss_clip": 0.01095309, "auxiliary_loss_mlp": 0.01035522, "balance_loss_clip": 1.0168184, "balance_loss_mlp": 1.02750969, "epoch": 0.16696227265895086, "flos": 18587548512000.0, "grad_norm": 2.1056546406434435, "language_loss": 0.80016923, "learning_rate": 3.731420760328818e-06, "loss": 0.82147753, "num_input_tokens_seen": 60153275, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.67578125, "step": 2777, "time_per_iteration": 2.3581535816192627 }, { "auxiliary_loss_clip": 0.01100116, "auxiliary_loss_mlp": 0.01037908, "balance_loss_clip": 1.01821482, "balance_loss_mlp": 1.02836227, "epoch": 0.16702239591161883, "flos": 23293037623680.0, "grad_norm": 1.775130683428226, "language_loss": 0.85230374, "learning_rate": 3.7312316122671977e-06, "loss": 0.87368405, "num_input_tokens_seen": 60173215, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.71875, "step": 2778, "time_per_iteration": 2.4324114322662354 }, { "auxiliary_loss_clip": 0.01103116, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.01637769, "balance_loss_mlp": 1.02825439, "epoch": 0.1670825191642868, "flos": 24424502832000.0, "grad_norm": 2.077255176239374, "language_loss": 0.74045932, "learning_rate": 3.731042402422375e-06, "loss": 0.76186025, "num_input_tokens_seen": 60190515, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.75, "step": 2779, "time_per_iteration": 3.8116507530212402 }, { "auxiliary_loss_clip": 0.01099015, "auxiliary_loss_mlp": 0.01040698, "balance_loss_clip": 1.0210402, "balance_loss_mlp": 1.02855587, "epoch": 0.16714264241695476, "flos": 26796292467840.0, "grad_norm": 3.883328641370763, "language_loss": 0.66294205, "learning_rate": 3.730853130801101e-06, "loss": 0.68433917, "num_input_tokens_seen": 60211655, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.703125, "step": 2780, "time_per_iteration": 2.469529628753662 }, { "auxiliary_loss_clip": 0.01098358, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.01637185, "balance_loss_mlp": 1.02710223, "epoch": 0.16720276566962272, "flos": 21834226707840.0, "grad_norm": 2.3808420385732205, "language_loss": 0.78112018, "learning_rate": 3.7306637974101312e-06, "loss": 0.80247027, "num_input_tokens_seen": 60230860, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7109375, "step": 2781, "time_per_iteration": 3.8163065910339355 }, { "auxiliary_loss_clip": 0.01102115, "auxiliary_loss_mlp": 0.01033987, "balance_loss_clip": 1.01564074, "balance_loss_mlp": 1.02904999, "epoch": 0.1672628889222907, "flos": 21469349422080.0, "grad_norm": 1.7121057808521025, "language_loss": 0.74994546, "learning_rate": 3.730474402256223e-06, "loss": 0.77130646, "num_input_tokens_seen": 60250535, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.73046875, "step": 2782, "time_per_iteration": 3.775129556655884 }, { "auxiliary_loss_clip": 0.01103694, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.01619673, "balance_loss_mlp": 1.02874088, "epoch": 0.16732301217495865, "flos": 30772690323840.0, "grad_norm": 4.691143884560107, "language_loss": 0.67676735, "learning_rate": 3.7302849453461337e-06, "loss": 0.69816923, "num_input_tokens_seen": 60269530, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.75, "step": 2783, "time_per_iteration": 2.4699010848999023 }, { "auxiliary_loss_clip": 0.01101268, "auxiliary_loss_mlp": 0.01038573, "balance_loss_clip": 1.01932085, "balance_loss_mlp": 1.02963662, "epoch": 0.16738313542762664, "flos": 23473573597440.0, "grad_norm": 1.7539523891610789, "language_loss": 0.70496118, "learning_rate": 3.730095426686626e-06, "loss": 0.72635961, "num_input_tokens_seen": 60289900, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.71875, "step": 2784, "time_per_iteration": 3.8684651851654053 }, { "auxiliary_loss_clip": 0.01102211, "auxiliary_loss_mlp": 0.01042852, "balance_loss_clip": 1.02014303, "balance_loss_mlp": 1.02739811, "epoch": 0.1674432586802946, "flos": 29787790469760.0, "grad_norm": 2.05039647654335, "language_loss": 0.60617006, "learning_rate": 3.729905846284463e-06, "loss": 0.62762076, "num_input_tokens_seen": 60310025, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.75, "step": 2785, "time_per_iteration": 2.4373538494110107 }, { "auxiliary_loss_clip": 0.01025677, "auxiliary_loss_mlp": 0.01008213, "balance_loss_clip": 1.00566173, "balance_loss_mlp": 1.00451803, "epoch": 0.16750338193296258, "flos": 66132547695360.0, "grad_norm": 0.8260912271145021, "language_loss": 0.58771896, "learning_rate": 3.72971620414641e-06, "loss": 0.60805786, "num_input_tokens_seen": 60377800, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.2109375, "step": 2786, "time_per_iteration": 3.076218605041504 }, { "auxiliary_loss_clip": 0.01101877, "auxiliary_loss_mlp": 0.01039603, "balance_loss_clip": 1.01880097, "balance_loss_mlp": 1.02818251, "epoch": 0.16756350518563054, "flos": 25695760590720.0, "grad_norm": 1.9732594496456566, "language_loss": 0.76632226, "learning_rate": 3.729526500279235e-06, "loss": 0.78773701, "num_input_tokens_seen": 60398215, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.73828125, "step": 2787, "time_per_iteration": 2.409895896911621 }, { "auxiliary_loss_clip": 0.01101295, "auxiliary_loss_mlp": 0.01038415, "balance_loss_clip": 1.0182445, "balance_loss_mlp": 1.02843809, "epoch": 0.1676236284382985, "flos": 23835134304000.0, "grad_norm": 2.4665047237238906, "language_loss": 0.76906705, "learning_rate": 3.729336734689708e-06, "loss": 0.7904641, "num_input_tokens_seen": 60416910, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.73046875, "step": 2788, "time_per_iteration": 2.4283406734466553 }, { "auxiliary_loss_clip": 0.01023516, "auxiliary_loss_mlp": 0.01007237, "balance_loss_clip": 1.00474536, "balance_loss_mlp": 1.00275159, "epoch": 0.16768375169096647, "flos": 59872167872640.0, "grad_norm": 0.8522369178846958, "language_loss": 0.59424734, "learning_rate": 3.7291469073846017e-06, "loss": 0.61455488, "num_input_tokens_seen": 60468660, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.20800781, "step": 2789, "time_per_iteration": 2.8966400623321533 }, { "auxiliary_loss_clip": 0.01102404, "auxiliary_loss_mlp": 0.0104348, "balance_loss_clip": 1.02137899, "balance_loss_mlp": 1.02846503, "epoch": 0.16774387494363446, "flos": 38434135806720.0, "grad_norm": 1.596248254233782, "language_loss": 0.69839656, "learning_rate": 3.72895701837069e-06, "loss": 0.71985543, "num_input_tokens_seen": 60492370, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.73828125, "step": 2790, "time_per_iteration": 2.550748586654663 }, { "auxiliary_loss_clip": 0.01102681, "auxiliary_loss_mlp": 0.01043039, "balance_loss_clip": 1.02257061, "balance_loss_mlp": 1.0283494, "epoch": 0.16780399819630243, "flos": 22636530817920.0, "grad_norm": 1.8564311737949704, "language_loss": 0.79571879, "learning_rate": 3.7287670676547495e-06, "loss": 0.81717592, "num_input_tokens_seen": 60512655, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.74609375, "step": 2791, "time_per_iteration": 2.3864128589630127 }, { "auxiliary_loss_clip": 0.0110316, "auxiliary_loss_mlp": 0.01046247, "balance_loss_clip": 1.02599347, "balance_loss_mlp": 1.02913141, "epoch": 0.1678641214489704, "flos": 32890102727040.0, "grad_norm": 2.08072886880986, "language_loss": 0.71467054, "learning_rate": 3.7285770552435593e-06, "loss": 0.73616463, "num_input_tokens_seen": 60533090, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7421875, "step": 2792, "time_per_iteration": 2.4916250705718994 }, { "auxiliary_loss_clip": 0.01102263, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.0188446, "balance_loss_mlp": 1.02897751, "epoch": 0.16792424470163836, "flos": 19973879712000.0, "grad_norm": 1.916949862508531, "language_loss": 0.71492851, "learning_rate": 3.7283869811439006e-06, "loss": 0.73633564, "num_input_tokens_seen": 60553190, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.734375, "step": 2793, "time_per_iteration": 2.388706922531128 }, { "auxiliary_loss_clip": 0.01103298, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.01902735, "balance_loss_mlp": 1.02974629, "epoch": 0.16798436795430632, "flos": 19718839163520.0, "grad_norm": 2.0882395204511353, "language_loss": 0.7694692, "learning_rate": 3.728196845362557e-06, "loss": 0.7908895, "num_input_tokens_seen": 60571995, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.734375, "step": 2794, "time_per_iteration": 2.3973076343536377 }, { "auxiliary_loss_clip": 0.01104216, "auxiliary_loss_mlp": 0.01042443, "balance_loss_clip": 1.02249885, "balance_loss_mlp": 1.03091669, "epoch": 0.1680444912069743, "flos": 28103755173120.0, "grad_norm": 3.660215334389666, "language_loss": 0.7173906, "learning_rate": 3.7280066479063128e-06, "loss": 0.73885721, "num_input_tokens_seen": 60591275, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.73046875, "step": 2795, "time_per_iteration": 2.4465954303741455 }, { "auxiliary_loss_clip": 0.01100168, "auxiliary_loss_mlp": 0.01029214, "balance_loss_clip": 1.00934196, "balance_loss_mlp": 1.02792013, "epoch": 0.16810461445964225, "flos": 18074290481280.0, "grad_norm": 2.0025951283747716, "language_loss": 0.83917654, "learning_rate": 3.7278163887819565e-06, "loss": 0.86047041, "num_input_tokens_seen": 60609235, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.72265625, "step": 2796, "time_per_iteration": 2.364109992980957 }, { "auxiliary_loss_clip": 0.01101342, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.02031505, "balance_loss_mlp": 1.02778924, "epoch": 0.16816473771231025, "flos": 23877518561280.0, "grad_norm": 2.6026955903410967, "language_loss": 0.81673908, "learning_rate": 3.727626067996277e-06, "loss": 0.83815849, "num_input_tokens_seen": 60629880, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.734375, "step": 2797, "time_per_iteration": 2.405458450317383 }, { "auxiliary_loss_clip": 0.01095684, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.01563668, "balance_loss_mlp": 1.02797866, "epoch": 0.1682248609649782, "flos": 22782502679040.0, "grad_norm": 1.5458723910185148, "language_loss": 0.75072479, "learning_rate": 3.727435685556068e-06, "loss": 0.77201039, "num_input_tokens_seen": 60651175, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.67578125, "step": 2798, "time_per_iteration": 2.412954330444336 }, { "auxiliary_loss_clip": 0.01102921, "auxiliary_loss_mlp": 0.01039167, "balance_loss_clip": 1.0206418, "balance_loss_mlp": 1.03008187, "epoch": 0.16828498421764618, "flos": 20704053219840.0, "grad_norm": 2.087515366307674, "language_loss": 0.79870963, "learning_rate": 3.7272452414681227e-06, "loss": 0.82013059, "num_input_tokens_seen": 60670210, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.7265625, "step": 2799, "time_per_iteration": 2.3892929553985596 }, { "auxiliary_loss_clip": 0.01103858, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.01366425, "balance_loss_mlp": 1.02899194, "epoch": 0.16834510747031414, "flos": 29419422048000.0, "grad_norm": 2.1088789988159067, "language_loss": 0.70523083, "learning_rate": 3.7270547357392375e-06, "loss": 0.72662044, "num_input_tokens_seen": 60690895, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.75, "step": 2800, "time_per_iteration": 2.461484909057617 }, { "auxiliary_loss_clip": 0.01099733, "auxiliary_loss_mlp": 0.01035997, "balance_loss_clip": 1.01558852, "balance_loss_mlp": 1.02740645, "epoch": 0.1684052307229821, "flos": 18144535870080.0, "grad_norm": 1.724561753876366, "language_loss": 0.83576268, "learning_rate": 3.7268641683762113e-06, "loss": 0.85712004, "num_input_tokens_seen": 60708280, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.72265625, "step": 2801, "time_per_iteration": 2.348421335220337 }, { "auxiliary_loss_clip": 0.01101752, "auxiliary_loss_mlp": 0.01041094, "balance_loss_clip": 1.02017236, "balance_loss_mlp": 1.02821028, "epoch": 0.16846535397565007, "flos": 16574177560320.0, "grad_norm": 2.7224930861772654, "language_loss": 0.82470471, "learning_rate": 3.7266735393858456e-06, "loss": 0.84613317, "num_input_tokens_seen": 60724150, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.734375, "step": 2802, "time_per_iteration": 2.3624260425567627 }, { "auxiliary_loss_clip": 0.01102921, "auxiliary_loss_mlp": 0.01043189, "balance_loss_clip": 1.02175534, "balance_loss_mlp": 1.02774191, "epoch": 0.16852547722831807, "flos": 30407568658560.0, "grad_norm": 1.5945284759246205, "language_loss": 0.80595237, "learning_rate": 3.7264828487749422e-06, "loss": 0.82741344, "num_input_tokens_seen": 60746485, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.75390625, "step": 2803, "time_per_iteration": 2.461865186691284 }, { "auxiliary_loss_clip": 0.01099747, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.0223465, "balance_loss_mlp": 1.02886486, "epoch": 0.16858560048098603, "flos": 33506110488960.0, "grad_norm": 2.253990294914759, "language_loss": 0.76085579, "learning_rate": 3.726292096550307e-06, "loss": 0.78227776, "num_input_tokens_seen": 60762875, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.70703125, "step": 2804, "time_per_iteration": 2.4685802459716797 }, { "auxiliary_loss_clip": 0.01021925, "auxiliary_loss_mlp": 0.0100264, "balance_loss_clip": 1.00030327, "balance_loss_mlp": 1.00205374, "epoch": 0.168645723733654, "flos": 67367111748480.0, "grad_norm": 0.8313708537078875, "language_loss": 0.55405569, "learning_rate": 3.7261012827187477e-06, "loss": 0.57430136, "num_input_tokens_seen": 60825510, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.19921875, "step": 2805, "time_per_iteration": 2.996175765991211 }, { "auxiliary_loss_clip": 0.01095749, "auxiliary_loss_mlp": 0.01034293, "balance_loss_clip": 1.01666236, "balance_loss_mlp": 1.02667689, "epoch": 0.16870584698632196, "flos": 21323552117760.0, "grad_norm": 2.471730494349587, "language_loss": 0.72939378, "learning_rate": 3.725910407287074e-06, "loss": 0.75069416, "num_input_tokens_seen": 60844440, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.69140625, "step": 2806, "time_per_iteration": 2.394117593765259 }, { "auxiliary_loss_clip": 0.01098756, "auxiliary_loss_mlp": 0.01038943, "balance_loss_clip": 1.02041769, "balance_loss_mlp": 1.02883208, "epoch": 0.16876597023898993, "flos": 20739699584640.0, "grad_norm": 2.0119891708960393, "language_loss": 0.70036387, "learning_rate": 3.7257194702620964e-06, "loss": 0.72174084, "num_input_tokens_seen": 60863210, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.69921875, "step": 2807, "time_per_iteration": 2.383373498916626 }, { "auxiliary_loss_clip": 0.01101881, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.02217221, "balance_loss_mlp": 1.03004301, "epoch": 0.1688260934916579, "flos": 20302447317120.0, "grad_norm": 2.4472358434644166, "language_loss": 0.70172656, "learning_rate": 3.725528471650631e-06, "loss": 0.72316158, "num_input_tokens_seen": 60882510, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.71875, "step": 2808, "time_per_iteration": 2.3989601135253906 }, { "auxiliary_loss_clip": 0.01100398, "auxiliary_loss_mlp": 0.0104131, "balance_loss_clip": 1.02078247, "balance_loss_mlp": 1.02708447, "epoch": 0.16888621674432586, "flos": 20339629781760.0, "grad_norm": 2.292779589312388, "language_loss": 0.80167681, "learning_rate": 3.7253374114594925e-06, "loss": 0.82309389, "num_input_tokens_seen": 60901105, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.734375, "step": 2809, "time_per_iteration": 2.369114398956299 }, { "auxiliary_loss_clip": 0.01109348, "auxiliary_loss_mlp": 0.0104127, "balance_loss_clip": 1.02125466, "balance_loss_mlp": 1.03182256, "epoch": 0.16894633999699385, "flos": 16244108766720.0, "grad_norm": 2.8977700555594548, "language_loss": 0.8793937, "learning_rate": 3.7251462896955e-06, "loss": 0.90089989, "num_input_tokens_seen": 60915340, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.7734375, "step": 2810, "time_per_iteration": 2.3441340923309326 }, { "auxiliary_loss_clip": 0.01103802, "auxiliary_loss_mlp": 0.01047031, "balance_loss_clip": 1.02674174, "balance_loss_mlp": 1.02955842, "epoch": 0.16900646324966181, "flos": 19609142071680.0, "grad_norm": 2.502849501120339, "language_loss": 0.92641753, "learning_rate": 3.724955106365474e-06, "loss": 0.94792581, "num_input_tokens_seen": 60933735, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7421875, "step": 2811, "time_per_iteration": 2.377276659011841 }, { "auxiliary_loss_clip": 0.01100842, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.01937222, "balance_loss_mlp": 1.02856755, "epoch": 0.16906658650232978, "flos": 22016997008640.0, "grad_norm": 2.3749282258990276, "language_loss": 0.78287768, "learning_rate": 3.724763861476237e-06, "loss": 0.80426866, "num_input_tokens_seen": 60953105, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.72265625, "step": 2812, "time_per_iteration": 2.377631902694702 }, { "auxiliary_loss_clip": 0.01100864, "auxiliary_loss_mlp": 0.01040717, "balance_loss_clip": 1.02241826, "balance_loss_mlp": 1.02951097, "epoch": 0.16912670975499774, "flos": 11762936340480.0, "grad_norm": 2.81241792316052, "language_loss": 0.7505877, "learning_rate": 3.724572555034615e-06, "loss": 0.77200353, "num_input_tokens_seen": 60969150, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.71484375, "step": 2813, "time_per_iteration": 2.3591530323028564 }, { "auxiliary_loss_clip": 0.01101558, "auxiliary_loss_mlp": 0.01041879, "balance_loss_clip": 1.02145851, "balance_loss_mlp": 1.02710545, "epoch": 0.1691868330076657, "flos": 17160543711360.0, "grad_norm": 8.40670518179425, "language_loss": 0.68826377, "learning_rate": 3.7243811870474346e-06, "loss": 0.70969814, "num_input_tokens_seen": 60982825, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.7421875, "step": 2814, "time_per_iteration": 2.324979066848755 }, { "auxiliary_loss_clip": 0.01100765, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.01856661, "balance_loss_mlp": 1.02842844, "epoch": 0.16924695626033368, "flos": 22415530711680.0, "grad_norm": 2.2746765096551487, "language_loss": 0.61625373, "learning_rate": 3.724189757521525e-06, "loss": 0.63764107, "num_input_tokens_seen": 61000875, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.72265625, "step": 2815, "time_per_iteration": 2.3952457904815674 }, { "auxiliary_loss_clip": 0.01097071, "auxiliary_loss_mlp": 0.01035948, "balance_loss_clip": 1.01794755, "balance_loss_mlp": 1.02694273, "epoch": 0.16930707951300164, "flos": 25738459050240.0, "grad_norm": 3.3403623940253144, "language_loss": 0.82395369, "learning_rate": 3.7239982664637185e-06, "loss": 0.84528393, "num_input_tokens_seen": 61021940, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.703125, "step": 2816, "time_per_iteration": 2.4072141647338867 }, { "auxiliary_loss_clip": 0.01102119, "auxiliary_loss_mlp": 0.01041241, "balance_loss_clip": 1.02181005, "balance_loss_mlp": 1.02807236, "epoch": 0.16936720276566963, "flos": 22745948618880.0, "grad_norm": 3.3116096120669414, "language_loss": 0.86611402, "learning_rate": 3.7238067138808477e-06, "loss": 0.88754761, "num_input_tokens_seen": 61040285, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.7421875, "step": 2817, "time_per_iteration": 2.388659715652466 }, { "auxiliary_loss_clip": 0.01100093, "auxiliary_loss_mlp": 0.01039138, "balance_loss_clip": 1.01989794, "balance_loss_mlp": 1.03019714, "epoch": 0.1694273260183376, "flos": 19572937125120.0, "grad_norm": 1.7000006864629023, "language_loss": 0.8144446, "learning_rate": 3.72361509977975e-06, "loss": 0.83583695, "num_input_tokens_seen": 61059020, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.69921875, "step": 2818, "time_per_iteration": 2.3735878467559814 }, { "auxiliary_loss_clip": 0.01097576, "auxiliary_loss_mlp": 0.01040857, "balance_loss_clip": 1.02055573, "balance_loss_mlp": 1.02641368, "epoch": 0.16948744927100556, "flos": 12457044547200.0, "grad_norm": 2.4156942861080433, "language_loss": 0.82009411, "learning_rate": 3.7234234241672632e-06, "loss": 0.84147841, "num_input_tokens_seen": 61074245, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7109375, "step": 2819, "time_per_iteration": 3.769258737564087 }, { "auxiliary_loss_clip": 0.01023284, "auxiliary_loss_mlp": 0.01007441, "balance_loss_clip": 1.00496185, "balance_loss_mlp": 1.00381994, "epoch": 0.16954757252367353, "flos": 71288731814400.0, "grad_norm": 0.9319188129893082, "language_loss": 0.61062413, "learning_rate": 3.7232316870502274e-06, "loss": 0.63093144, "num_input_tokens_seen": 61127080, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.1953125, "step": 2820, "time_per_iteration": 2.91005539894104 }, { "auxiliary_loss_clip": 0.01099981, "auxiliary_loss_mlp": 0.01043868, "balance_loss_clip": 1.02462757, "balance_loss_mlp": 1.02774501, "epoch": 0.1696076957763415, "flos": 29605229637120.0, "grad_norm": 3.091531330886817, "language_loss": 0.78350353, "learning_rate": 3.723039888435485e-06, "loss": 0.80494201, "num_input_tokens_seen": 61146955, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.72265625, "step": 2821, "time_per_iteration": 3.8358261585235596 }, { "auxiliary_loss_clip": 0.01102246, "auxiliary_loss_mlp": 0.01045946, "balance_loss_clip": 1.02509594, "balance_loss_mlp": 1.03072417, "epoch": 0.16966781902900946, "flos": 24387460012800.0, "grad_norm": 1.9821330134851807, "language_loss": 0.78271604, "learning_rate": 3.722848028329882e-06, "loss": 0.80419791, "num_input_tokens_seen": 61166605, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.71484375, "step": 2822, "time_per_iteration": 3.8631176948547363 }, { "auxiliary_loss_clip": 0.01099396, "auxiliary_loss_mlp": 0.01037836, "balance_loss_clip": 1.01911986, "balance_loss_mlp": 1.02830386, "epoch": 0.16972794228167745, "flos": 23037717784320.0, "grad_norm": 4.088592431205343, "language_loss": 0.75136393, "learning_rate": 3.7226561067402638e-06, "loss": 0.77273631, "num_input_tokens_seen": 61186535, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.7109375, "step": 2823, "time_per_iteration": 3.7679522037506104 }, { "auxiliary_loss_clip": 0.01100015, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.01944184, "balance_loss_mlp": 1.0288341, "epoch": 0.16978806553434542, "flos": 35227153693440.0, "grad_norm": 2.161818085815661, "language_loss": 0.60268676, "learning_rate": 3.7224641236734805e-06, "loss": 0.62407881, "num_input_tokens_seen": 61208965, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.7109375, "step": 2824, "time_per_iteration": 2.519198179244995 }, { "auxiliary_loss_clip": 0.01099241, "auxiliary_loss_mlp": 0.01039886, "balance_loss_clip": 1.0196799, "balance_loss_mlp": 1.02819765, "epoch": 0.16984818878701338, "flos": 32012944928640.0, "grad_norm": 1.6091365944583946, "language_loss": 0.73137844, "learning_rate": 3.7222720791363837e-06, "loss": 0.75276971, "num_input_tokens_seen": 61230670, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.7109375, "step": 2825, "time_per_iteration": 2.457822561264038 }, { "auxiliary_loss_clip": 0.01103609, "auxiliary_loss_mlp": 0.0104685, "balance_loss_clip": 1.02480817, "balance_loss_mlp": 1.0285238, "epoch": 0.16990831203968135, "flos": 22817555550720.0, "grad_norm": 2.0299478360072247, "language_loss": 0.85285699, "learning_rate": 3.7220799731358264e-06, "loss": 0.87436152, "num_input_tokens_seen": 61249510, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.75, "step": 2826, "time_per_iteration": 2.3784821033477783 }, { "auxiliary_loss_clip": 0.01105385, "auxiliary_loss_mlp": 0.01045185, "balance_loss_clip": 1.02478862, "balance_loss_mlp": 1.0288589, "epoch": 0.1699684352923493, "flos": 23038485834240.0, "grad_norm": 1.7377436091686924, "language_loss": 0.82375735, "learning_rate": 3.721887805678665e-06, "loss": 0.84526312, "num_input_tokens_seen": 61269440, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.765625, "step": 2827, "time_per_iteration": 2.3947606086730957 }, { "auxiliary_loss_clip": 0.01102699, "auxiliary_loss_mlp": 0.01037606, "balance_loss_clip": 1.01596928, "balance_loss_mlp": 1.02850986, "epoch": 0.17002855854501728, "flos": 21433039741440.0, "grad_norm": 1.7688978957597494, "language_loss": 0.73898339, "learning_rate": 3.7216955767717558e-06, "loss": 0.76038647, "num_input_tokens_seen": 61288195, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.7421875, "step": 2828, "time_per_iteration": 2.3846895694732666 }, { "auxiliary_loss_clip": 0.0102301, "auxiliary_loss_mlp": 0.01005238, "balance_loss_clip": 1.00283051, "balance_loss_mlp": 1.00294185, "epoch": 0.17008868179768524, "flos": 71450099585280.0, "grad_norm": 0.7636207272108545, "language_loss": 0.56493086, "learning_rate": 3.721503286421961e-06, "loss": 0.58521336, "num_input_tokens_seen": 61350850, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.20117188, "step": 2829, "time_per_iteration": 3.060309648513794 }, { "auxiliary_loss_clip": 0.01100291, "auxiliary_loss_mlp": 0.01034828, "balance_loss_clip": 1.01595736, "balance_loss_mlp": 1.02777958, "epoch": 0.17014880505035324, "flos": 24899147032320.0, "grad_norm": 1.9485488632433958, "language_loss": 0.83049953, "learning_rate": 3.7213109346361424e-06, "loss": 0.85185075, "num_input_tokens_seen": 61370765, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.7265625, "step": 2830, "time_per_iteration": 2.4133338928222656 }, { "auxiliary_loss_clip": 0.01100308, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.01582432, "balance_loss_mlp": 1.02769089, "epoch": 0.1702089283030212, "flos": 29861108058240.0, "grad_norm": 1.8773353166213922, "language_loss": 0.78348982, "learning_rate": 3.721118521421164e-06, "loss": 0.80485415, "num_input_tokens_seen": 61388935, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7265625, "step": 2831, "time_per_iteration": 2.445157527923584 }, { "auxiliary_loss_clip": 0.0110162, "auxiliary_loss_mlp": 0.01043851, "balance_loss_clip": 1.02229834, "balance_loss_mlp": 1.02805817, "epoch": 0.17026905155568917, "flos": 17743348903680.0, "grad_norm": 2.739917273717214, "language_loss": 0.79639959, "learning_rate": 3.7209260467838926e-06, "loss": 0.81785429, "num_input_tokens_seen": 61407350, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 0.734375, "step": 2832, "time_per_iteration": 2.355471611022949 }, { "auxiliary_loss_clip": 0.01100581, "auxiliary_loss_mlp": 0.01042989, "balance_loss_clip": 1.02341509, "balance_loss_mlp": 1.0275898, "epoch": 0.17032917480835713, "flos": 23147554521600.0, "grad_norm": 1.6157854847826956, "language_loss": 0.8847543, "learning_rate": 3.720733510731198e-06, "loss": 0.90619004, "num_input_tokens_seen": 61429010, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.73046875, "step": 2833, "time_per_iteration": 2.4187119007110596 }, { "auxiliary_loss_clip": 0.01099931, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.02253354, "balance_loss_mlp": 1.02734184, "epoch": 0.1703892980610251, "flos": 39201003020160.0, "grad_norm": 2.2330127490136915, "language_loss": 0.71865654, "learning_rate": 3.72054091326995e-06, "loss": 0.74008387, "num_input_tokens_seen": 61450040, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7265625, "step": 2834, "time_per_iteration": 2.5245070457458496 }, { "auxiliary_loss_clip": 0.01104221, "auxiliary_loss_mlp": 0.01044854, "balance_loss_clip": 1.02518415, "balance_loss_mlp": 1.03048635, "epoch": 0.17044942131369306, "flos": 23037997075200.0, "grad_norm": 2.04637989842674, "language_loss": 0.86782855, "learning_rate": 3.7203482544070227e-06, "loss": 0.8893193, "num_input_tokens_seen": 61468585, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.73828125, "step": 2835, "time_per_iteration": 2.3873298168182373 }, { "auxiliary_loss_clip": 0.0110199, "auxiliary_loss_mlp": 0.0104297, "balance_loss_clip": 1.02076173, "balance_loss_mlp": 1.0270679, "epoch": 0.17050954456636103, "flos": 17054058464640.0, "grad_norm": 2.074709036086154, "language_loss": 0.73609614, "learning_rate": 3.720155534149292e-06, "loss": 0.75754571, "num_input_tokens_seen": 61486330, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 0.75, "step": 2836, "time_per_iteration": 2.3513550758361816 }, { "auxiliary_loss_clip": 0.01107355, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.01742601, "balance_loss_mlp": 1.02962399, "epoch": 0.17056966781902902, "flos": 16836025824000.0, "grad_norm": 2.084422244258617, "language_loss": 0.80140126, "learning_rate": 3.7199627525036343e-06, "loss": 0.8228792, "num_input_tokens_seen": 61503950, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 0.77734375, "step": 2837, "time_per_iteration": 2.3739218711853027 }, { "auxiliary_loss_clip": 0.01098566, "auxiliary_loss_mlp": 0.01039546, "balance_loss_clip": 1.01877975, "balance_loss_mlp": 1.02908397, "epoch": 0.17062979107169698, "flos": 17711577699840.0, "grad_norm": 9.587808322010051, "language_loss": 0.83528239, "learning_rate": 3.7197699094769303e-06, "loss": 0.85666353, "num_input_tokens_seen": 61523550, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.6953125, "step": 2838, "time_per_iteration": 2.3722033500671387 }, { "auxiliary_loss_clip": 0.01098691, "auxiliary_loss_mlp": 0.01034909, "balance_loss_clip": 1.01614571, "balance_loss_mlp": 1.02855921, "epoch": 0.17068991432436495, "flos": 22524040817280.0, "grad_norm": 1.7594720877797905, "language_loss": 0.93555927, "learning_rate": 3.719577005076062e-06, "loss": 0.95689523, "num_input_tokens_seen": 61542720, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.703125, "step": 2839, "time_per_iteration": 2.414508581161499 }, { "auxiliary_loss_clip": 0.01102511, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.01892352, "balance_loss_mlp": 1.02878416, "epoch": 0.17075003757703291, "flos": 25881812559360.0, "grad_norm": 2.486884554266925, "language_loss": 0.83609664, "learning_rate": 3.719384039307914e-06, "loss": 0.85751897, "num_input_tokens_seen": 61563040, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.734375, "step": 2840, "time_per_iteration": 2.414292573928833 }, { "auxiliary_loss_clip": 0.01102081, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.01995409, "balance_loss_mlp": 1.02834702, "epoch": 0.17081016082970088, "flos": 20119677016320.0, "grad_norm": 1.9077523811333352, "language_loss": 0.75979531, "learning_rate": 3.7191910121793723e-06, "loss": 0.78122818, "num_input_tokens_seen": 61581890, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.734375, "step": 2841, "time_per_iteration": 2.3861541748046875 }, { "auxiliary_loss_clip": 0.0109873, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.02227688, "balance_loss_mlp": 1.02622223, "epoch": 0.17087028408236885, "flos": 24935317067520.0, "grad_norm": 1.8118609133322574, "language_loss": 0.76893795, "learning_rate": 3.718997923697326e-06, "loss": 0.79035485, "num_input_tokens_seen": 61602095, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.7265625, "step": 2842, "time_per_iteration": 2.4093101024627686 }, { "auxiliary_loss_clip": 0.01098277, "auxiliary_loss_mlp": 0.01038635, "balance_loss_clip": 1.01902461, "balance_loss_mlp": 1.02864337, "epoch": 0.17093040733503684, "flos": 19056990919680.0, "grad_norm": 1.96747438457974, "language_loss": 0.85524523, "learning_rate": 3.7188047738686655e-06, "loss": 0.87661433, "num_input_tokens_seen": 61620400, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6953125, "step": 2843, "time_per_iteration": 2.4019691944122314 }, { "auxiliary_loss_clip": 0.01098306, "auxiliary_loss_mlp": 0.01040205, "balance_loss_clip": 1.02007055, "balance_loss_mlp": 1.02914369, "epoch": 0.1709905305877048, "flos": 13078114456320.0, "grad_norm": 1.7886432537207648, "language_loss": 0.68202627, "learning_rate": 3.7186115627002837e-06, "loss": 0.70341146, "num_input_tokens_seen": 61637680, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.69140625, "step": 2844, "time_per_iteration": 2.374447822570801 }, { "auxiliary_loss_clip": 0.01101739, "auxiliary_loss_mlp": 0.01043683, "balance_loss_clip": 1.02196276, "balance_loss_mlp": 1.02908492, "epoch": 0.17105065384037277, "flos": 19208304218880.0, "grad_norm": 2.0010483451506085, "language_loss": 0.78770077, "learning_rate": 3.718418290199076e-06, "loss": 0.80915499, "num_input_tokens_seen": 61655630, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.7265625, "step": 2845, "time_per_iteration": 2.3720314502716064 }, { "auxiliary_loss_clip": 0.01097968, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.02000928, "balance_loss_mlp": 1.02587223, "epoch": 0.17111077709304073, "flos": 18514196012160.0, "grad_norm": 4.3704053337883755, "language_loss": 0.77804375, "learning_rate": 3.71822495637194e-06, "loss": 0.79941869, "num_input_tokens_seen": 61673475, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.72265625, "step": 2846, "time_per_iteration": 2.3560523986816406 }, { "auxiliary_loss_clip": 0.01099426, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.02113986, "balance_loss_mlp": 1.02861381, "epoch": 0.1711709003457087, "flos": 25081498396800.0, "grad_norm": 1.711799864431428, "language_loss": 0.79977489, "learning_rate": 3.7180315612257748e-06, "loss": 0.8211695, "num_input_tokens_seen": 61693370, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.70703125, "step": 2847, "time_per_iteration": 2.428332805633545 }, { "auxiliary_loss_clip": 0.01097876, "auxiliary_loss_mlp": 0.01039338, "balance_loss_clip": 1.02033615, "balance_loss_mlp": 1.02556586, "epoch": 0.17123102359837666, "flos": 17565431281920.0, "grad_norm": 3.0462412738497533, "language_loss": 0.86679769, "learning_rate": 3.7178381047674825e-06, "loss": 0.88816977, "num_input_tokens_seen": 61710820, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.72265625, "step": 2848, "time_per_iteration": 2.3382651805877686 }, { "auxiliary_loss_clip": 0.01100188, "auxiliary_loss_mlp": 0.01039521, "balance_loss_clip": 1.01960135, "balance_loss_mlp": 1.02768493, "epoch": 0.17129114685104463, "flos": 26172534384000.0, "grad_norm": 2.1077203639678475, "language_loss": 0.75360501, "learning_rate": 3.717644587003967e-06, "loss": 0.77500212, "num_input_tokens_seen": 61729855, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.72265625, "step": 2849, "time_per_iteration": 2.4131977558135986 }, { "auxiliary_loss_clip": 0.01026538, "auxiliary_loss_mlp": 0.01004335, "balance_loss_clip": 1.00199831, "balance_loss_mlp": 1.00581324, "epoch": 0.17135127010371262, "flos": 69266212220160.0, "grad_norm": 0.7842330333871769, "language_loss": 0.57423878, "learning_rate": 3.7174510079421347e-06, "loss": 0.59454751, "num_input_tokens_seen": 61790290, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.20703125, "step": 2850, "time_per_iteration": 3.004063367843628 }, { "auxiliary_loss_clip": 0.01097386, "auxiliary_loss_mlp": 0.01042264, "balance_loss_clip": 1.02221298, "balance_loss_mlp": 1.02785206, "epoch": 0.1714113933563806, "flos": 23548985867520.0, "grad_norm": 3.1580788317294433, "language_loss": 0.80728292, "learning_rate": 3.7172573675888937e-06, "loss": 0.82867938, "num_input_tokens_seen": 61809265, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.6953125, "step": 2851, "time_per_iteration": 2.409008502960205 }, { "auxiliary_loss_clip": 0.01096345, "auxiliary_loss_mlp": 0.01035792, "balance_loss_clip": 1.01668262, "balance_loss_mlp": 1.02716529, "epoch": 0.17147151660904855, "flos": 21141375310080.0, "grad_norm": 6.502278396988303, "language_loss": 0.93110287, "learning_rate": 3.717063665951155e-06, "loss": 0.95242417, "num_input_tokens_seen": 61828980, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.69140625, "step": 2852, "time_per_iteration": 2.3677895069122314 }, { "auxiliary_loss_clip": 0.01099955, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.02143705, "balance_loss_mlp": 1.02682829, "epoch": 0.17153163986171652, "flos": 18623893104000.0, "grad_norm": 1.9697959632093773, "language_loss": 0.68919253, "learning_rate": 3.7168699030358305e-06, "loss": 0.71060312, "num_input_tokens_seen": 61847915, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.734375, "step": 2853, "time_per_iteration": 2.3650238513946533 }, { "auxiliary_loss_clip": 0.01103775, "auxiliary_loss_mlp": 0.01041317, "balance_loss_clip": 1.02082443, "balance_loss_mlp": 1.03048611, "epoch": 0.17159176311438448, "flos": 18222287201280.0, "grad_norm": 2.3573506234697623, "language_loss": 0.66342807, "learning_rate": 3.7166760788498355e-06, "loss": 0.68487895, "num_input_tokens_seen": 61865570, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.734375, "step": 2854, "time_per_iteration": 2.346888303756714 }, { "auxiliary_loss_clip": 0.01094329, "auxiliary_loss_mlp": 0.0103849, "balance_loss_clip": 1.0200839, "balance_loss_mlp": 1.02519643, "epoch": 0.17165188636705245, "flos": 20737988928000.0, "grad_norm": 1.7339636954993085, "language_loss": 0.89137179, "learning_rate": 3.716482193400087e-06, "loss": 0.9127, "num_input_tokens_seen": 61883340, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.69140625, "step": 2855, "time_per_iteration": 2.3888208866119385 }, { "auxiliary_loss_clip": 0.01101268, "auxiliary_loss_mlp": 0.0103767, "balance_loss_clip": 1.01755893, "balance_loss_mlp": 1.02768111, "epoch": 0.17171200961972044, "flos": 24898728096000.0, "grad_norm": 1.9672459927595096, "language_loss": 0.82613242, "learning_rate": 3.7162882466935042e-06, "loss": 0.84752178, "num_input_tokens_seen": 61900610, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.734375, "step": 2856, "time_per_iteration": 2.4004223346710205 }, { "auxiliary_loss_clip": 0.01098951, "auxiliary_loss_mlp": 0.0104157, "balance_loss_clip": 1.02188826, "balance_loss_mlp": 1.02765584, "epoch": 0.1717721328723884, "flos": 20156196165120.0, "grad_norm": 2.041226216781192, "language_loss": 0.86407518, "learning_rate": 3.716094238737009e-06, "loss": 0.8854804, "num_input_tokens_seen": 61916795, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.7109375, "step": 2857, "time_per_iteration": 2.368487596511841 }, { "auxiliary_loss_clip": 0.01100271, "auxiliary_loss_mlp": 0.01043538, "balance_loss_clip": 1.02352262, "balance_loss_mlp": 1.02817392, "epoch": 0.17183225612505637, "flos": 23360699571840.0, "grad_norm": 2.6917011832688993, "language_loss": 0.78101349, "learning_rate": 3.715900169537524e-06, "loss": 0.80245161, "num_input_tokens_seen": 61936665, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.72265625, "step": 2858, "time_per_iteration": 2.407489776611328 }, { "auxiliary_loss_clip": 0.01106981, "auxiliary_loss_mlp": 0.01050912, "balance_loss_clip": 1.02846503, "balance_loss_mlp": 1.0280242, "epoch": 0.17189237937772434, "flos": 18113253425280.0, "grad_norm": 2.327169641792259, "language_loss": 0.76883638, "learning_rate": 3.7157060391019767e-06, "loss": 0.79041535, "num_input_tokens_seen": 61954415, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.7890625, "step": 2859, "time_per_iteration": 3.7524237632751465 }, { "auxiliary_loss_clip": 0.01097007, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.01747096, "balance_loss_mlp": 1.02755082, "epoch": 0.1719525026303923, "flos": 23257286524800.0, "grad_norm": 1.9563460985952137, "language_loss": 0.76953274, "learning_rate": 3.7155118474372936e-06, "loss": 0.79087549, "num_input_tokens_seen": 61973940, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6953125, "step": 2860, "time_per_iteration": 2.4103503227233887 }, { "auxiliary_loss_clip": 0.01099755, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.01408386, "balance_loss_mlp": 1.02734613, "epoch": 0.17201262588306027, "flos": 20809456214400.0, "grad_norm": 2.818945073790931, "language_loss": 0.81869853, "learning_rate": 3.7153175945504057e-06, "loss": 0.84003723, "num_input_tokens_seen": 61991845, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.7265625, "step": 2861, "time_per_iteration": 3.7697300910949707 }, { "auxiliary_loss_clip": 0.01098907, "auxiliary_loss_mlp": 0.01039077, "balance_loss_clip": 1.0195142, "balance_loss_mlp": 1.02756715, "epoch": 0.17207274913572823, "flos": 20374822298880.0, "grad_norm": 4.648906746694479, "language_loss": 0.85571301, "learning_rate": 3.7151232804482456e-06, "loss": 0.87709284, "num_input_tokens_seen": 62009395, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.71484375, "step": 2862, "time_per_iteration": 3.7209343910217285 }, { "auxiliary_loss_clip": 0.0109487, "auxiliary_loss_mlp": 0.01037687, "balance_loss_clip": 1.01934016, "balance_loss_mlp": 1.02720714, "epoch": 0.17213287238839622, "flos": 26796501936000.0, "grad_norm": 3.680640271375408, "language_loss": 0.78025091, "learning_rate": 3.7149289051377474e-06, "loss": 0.8015765, "num_input_tokens_seen": 62029005, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.67578125, "step": 2863, "time_per_iteration": 3.7889277935028076 }, { "auxiliary_loss_clip": 0.01097385, "auxiliary_loss_mlp": 0.01042709, "balance_loss_clip": 1.02197862, "balance_loss_mlp": 1.02595067, "epoch": 0.1721929956410642, "flos": 26029634722560.0, "grad_norm": 1.6838877162819452, "language_loss": 0.72329086, "learning_rate": 3.714734468625847e-06, "loss": 0.74469173, "num_input_tokens_seen": 62048730, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.71484375, "step": 2864, "time_per_iteration": 2.438499927520752 }, { "auxiliary_loss_clip": 0.01101815, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.01929295, "balance_loss_mlp": 1.02860403, "epoch": 0.17225311889373215, "flos": 22272002645760.0, "grad_norm": 2.0219813649855074, "language_loss": 0.72502583, "learning_rate": 3.714539970919485e-06, "loss": 0.74642563, "num_input_tokens_seen": 62069000, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.734375, "step": 2865, "time_per_iteration": 2.3733339309692383 }, { "auxiliary_loss_clip": 0.01100059, "auxiliary_loss_mlp": 0.01040146, "balance_loss_clip": 1.02076221, "balance_loss_mlp": 1.02951944, "epoch": 0.17231324214640012, "flos": 21286718766720.0, "grad_norm": 2.9961208754105124, "language_loss": 0.78705955, "learning_rate": 3.7143454120256017e-06, "loss": 0.80846155, "num_input_tokens_seen": 62086750, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.70703125, "step": 2866, "time_per_iteration": 2.4031829833984375 }, { "auxiliary_loss_clip": 0.01097716, "auxiliary_loss_mlp": 0.01040117, "balance_loss_clip": 1.02022111, "balance_loss_mlp": 1.02698362, "epoch": 0.17237336539906808, "flos": 19679771485440.0, "grad_norm": 1.706005589775413, "language_loss": 0.79743361, "learning_rate": 3.71415079195114e-06, "loss": 0.81881189, "num_input_tokens_seen": 62106240, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.70703125, "step": 2867, "time_per_iteration": 2.3662564754486084 }, { "auxiliary_loss_clip": 0.01097452, "auxiliary_loss_mlp": 0.01035584, "balance_loss_clip": 1.01467443, "balance_loss_mlp": 1.02625537, "epoch": 0.17243348865173605, "flos": 17528702664960.0, "grad_norm": 1.9585180560379911, "language_loss": 0.79336035, "learning_rate": 3.713956110703046e-06, "loss": 0.81469071, "num_input_tokens_seen": 62124895, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7109375, "step": 2868, "time_per_iteration": 2.3915653228759766 }, { "auxiliary_loss_clip": 0.0110449, "auxiliary_loss_mlp": 0.01038557, "balance_loss_clip": 1.01855421, "balance_loss_mlp": 1.02958059, "epoch": 0.17249361190440402, "flos": 18258876172800.0, "grad_norm": 2.445199233157961, "language_loss": 0.83990276, "learning_rate": 3.713761368288268e-06, "loss": 0.86133319, "num_input_tokens_seen": 62143510, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.75, "step": 2869, "time_per_iteration": 2.3506219387054443 }, { "auxiliary_loss_clip": 0.01100771, "auxiliary_loss_mlp": 0.01042502, "balance_loss_clip": 1.02104473, "balance_loss_mlp": 1.02764046, "epoch": 0.172553735157072, "flos": 21173425804800.0, "grad_norm": 1.7938640814322828, "language_loss": 0.76781571, "learning_rate": 3.713566564713754e-06, "loss": 0.78924841, "num_input_tokens_seen": 62162285, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.73046875, "step": 2870, "time_per_iteration": 2.4121081829071045 }, { "auxiliary_loss_clip": 0.01094729, "auxiliary_loss_mlp": 0.01034617, "balance_loss_clip": 1.0175581, "balance_loss_mlp": 1.02771533, "epoch": 0.17261385840973997, "flos": 22272177202560.0, "grad_norm": 1.7992539394948617, "language_loss": 0.76963258, "learning_rate": 3.7133716999864574e-06, "loss": 0.7909261, "num_input_tokens_seen": 62180970, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.671875, "step": 2871, "time_per_iteration": 2.3836746215820312 }, { "auxiliary_loss_clip": 0.01097193, "auxiliary_loss_mlp": 0.010355, "balance_loss_clip": 1.01587784, "balance_loss_mlp": 1.02676845, "epoch": 0.17267398166240794, "flos": 27921159429120.0, "grad_norm": 2.6200849364685523, "language_loss": 0.7476573, "learning_rate": 3.7131767741133327e-06, "loss": 0.76898426, "num_input_tokens_seen": 62198965, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.703125, "step": 2872, "time_per_iteration": 2.450266122817993 }, { "auxiliary_loss_clip": 0.01095105, "auxiliary_loss_mlp": 0.01039845, "balance_loss_clip": 1.0204612, "balance_loss_mlp": 1.02686155, "epoch": 0.1727341049150759, "flos": 21944028533760.0, "grad_norm": 6.242095849254738, "language_loss": 0.82088262, "learning_rate": 3.712981787101335e-06, "loss": 0.84223211, "num_input_tokens_seen": 62219890, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6796875, "step": 2873, "time_per_iteration": 2.385953903198242 }, { "auxiliary_loss_clip": 0.01097729, "auxiliary_loss_mlp": 0.01035709, "balance_loss_clip": 1.01587296, "balance_loss_mlp": 1.0280807, "epoch": 0.17279422816774387, "flos": 18107074114560.0, "grad_norm": 2.0824944934024656, "language_loss": 0.74705172, "learning_rate": 3.7127867389574244e-06, "loss": 0.76838607, "num_input_tokens_seen": 62237140, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6953125, "step": 2874, "time_per_iteration": 2.3947160243988037 }, { "auxiliary_loss_clip": 0.01097361, "auxiliary_loss_mlp": 0.01040762, "balance_loss_clip": 1.01957846, "balance_loss_mlp": 1.02649164, "epoch": 0.17285435142041183, "flos": 21834366353280.0, "grad_norm": 2.0074995783007985, "language_loss": 0.80613792, "learning_rate": 3.7125916296885606e-06, "loss": 0.82751918, "num_input_tokens_seen": 62255405, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.7109375, "step": 2875, "time_per_iteration": 2.3952105045318604 }, { "auxiliary_loss_clip": 0.01100966, "auxiliary_loss_mlp": 0.01041017, "balance_loss_clip": 1.01912999, "balance_loss_mlp": 1.02759469, "epoch": 0.17291447467307983, "flos": 18367491012480.0, "grad_norm": 2.8247080586996716, "language_loss": 0.87094033, "learning_rate": 3.7123964593017066e-06, "loss": 0.89236015, "num_input_tokens_seen": 62271280, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.734375, "step": 2876, "time_per_iteration": 2.377943515777588 }, { "auxiliary_loss_clip": 0.01097734, "auxiliary_loss_mlp": 0.01039503, "balance_loss_clip": 1.02060854, "balance_loss_mlp": 1.0285635, "epoch": 0.1729745979257478, "flos": 18623648724480.0, "grad_norm": 1.839598799780756, "language_loss": 0.84719235, "learning_rate": 3.7122012278038285e-06, "loss": 0.86856472, "num_input_tokens_seen": 62289140, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.69140625, "step": 2877, "time_per_iteration": 2.348487138748169 }, { "auxiliary_loss_clip": 0.01100964, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.01605749, "balance_loss_mlp": 1.02966297, "epoch": 0.17303472117841576, "flos": 22997253651840.0, "grad_norm": 2.2625307315465935, "language_loss": 0.79290515, "learning_rate": 3.7120059352018922e-06, "loss": 0.81427491, "num_input_tokens_seen": 62307490, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.7109375, "step": 2878, "time_per_iteration": 2.4186136722564697 }, { "auxiliary_loss_clip": 0.01095456, "auxiliary_loss_mlp": 0.01037214, "balance_loss_clip": 1.01696002, "balance_loss_mlp": 1.02593291, "epoch": 0.17309484443108372, "flos": 25663256248320.0, "grad_norm": 1.7061990498952924, "language_loss": 0.70197231, "learning_rate": 3.7118105815028677e-06, "loss": 0.72329903, "num_input_tokens_seen": 62328570, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.6953125, "step": 2879, "time_per_iteration": 2.4180474281311035 }, { "auxiliary_loss_clip": 0.01097694, "auxiliary_loss_mlp": 0.01041854, "balance_loss_clip": 1.02081323, "balance_loss_mlp": 1.02637446, "epoch": 0.1731549676837517, "flos": 13552060429440.0, "grad_norm": 2.0531514935022184, "language_loss": 0.8334012, "learning_rate": 3.7116151667137272e-06, "loss": 0.85479665, "num_input_tokens_seen": 62345735, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.71484375, "step": 2880, "time_per_iteration": 2.375086545944214 }, { "auxiliary_loss_clip": 0.01102314, "auxiliary_loss_mlp": 0.01036382, "balance_loss_clip": 1.01543653, "balance_loss_mlp": 1.02918601, "epoch": 0.17321509093641965, "flos": 22855959912960.0, "grad_norm": 2.0841091537860934, "language_loss": 0.80595112, "learning_rate": 3.7114196908414444e-06, "loss": 0.8273381, "num_input_tokens_seen": 62365525, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.734375, "step": 2881, "time_per_iteration": 2.3861749172210693 }, { "auxiliary_loss_clip": 0.01099988, "auxiliary_loss_mlp": 0.01034552, "balance_loss_clip": 1.01553822, "balance_loss_mlp": 1.02886462, "epoch": 0.17327521418908762, "flos": 24351639091200.0, "grad_norm": 2.6595247932454313, "language_loss": 0.77453423, "learning_rate": 3.7112241538929946e-06, "loss": 0.7958796, "num_input_tokens_seen": 62385160, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.7109375, "step": 2882, "time_per_iteration": 2.4411187171936035 }, { "auxiliary_loss_clip": 0.01096081, "auxiliary_loss_mlp": 0.01035144, "balance_loss_clip": 1.01562929, "balance_loss_mlp": 1.02655911, "epoch": 0.1733353374417556, "flos": 33104364940800.0, "grad_norm": 1.8538684903891403, "language_loss": 0.76314259, "learning_rate": 3.711028555875357e-06, "loss": 0.78445482, "num_input_tokens_seen": 62405280, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6953125, "step": 2883, "time_per_iteration": 2.4880666732788086 }, { "auxiliary_loss_clip": 0.01097012, "auxiliary_loss_mlp": 0.01035835, "balance_loss_clip": 1.01678514, "balance_loss_mlp": 1.02765751, "epoch": 0.17339546069442358, "flos": 24387809126400.0, "grad_norm": 1.8911862531878234, "language_loss": 0.85407919, "learning_rate": 3.7108328967955113e-06, "loss": 0.87540758, "num_input_tokens_seen": 62423665, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6953125, "step": 2884, "time_per_iteration": 2.4213695526123047 }, { "auxiliary_loss_clip": 0.01098005, "auxiliary_loss_mlp": 0.0104383, "balance_loss_clip": 1.02525759, "balance_loss_mlp": 1.02772915, "epoch": 0.17345558394709154, "flos": 27452938919040.0, "grad_norm": 2.875467068049921, "language_loss": 0.74540174, "learning_rate": 3.7106371766604408e-06, "loss": 0.76682007, "num_input_tokens_seen": 62445170, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.703125, "step": 2885, "time_per_iteration": 2.422321319580078 }, { "auxiliary_loss_clip": 0.01093593, "auxiliary_loss_mlp": 0.01034796, "balance_loss_clip": 1.01702213, "balance_loss_mlp": 1.02822804, "epoch": 0.1735157071997595, "flos": 24680974746240.0, "grad_norm": 1.5230780813505223, "language_loss": 0.70776856, "learning_rate": 3.7104413954771294e-06, "loss": 0.72905242, "num_input_tokens_seen": 62466135, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.65234375, "step": 2886, "time_per_iteration": 2.4529497623443604 }, { "auxiliary_loss_clip": 0.0109737, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.01472795, "balance_loss_mlp": 1.02688885, "epoch": 0.17357583045242747, "flos": 21687870821760.0, "grad_norm": 2.549200029262177, "language_loss": 0.69418108, "learning_rate": 3.710245553252564e-06, "loss": 0.7155003, "num_input_tokens_seen": 62483910, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.703125, "step": 2887, "time_per_iteration": 2.4095358848571777 }, { "auxiliary_loss_clip": 0.01098123, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.02282476, "balance_loss_mlp": 1.02799964, "epoch": 0.17363595370509544, "flos": 15374875847040.0, "grad_norm": 1.8093871820287766, "language_loss": 0.853176, "learning_rate": 3.7100496499937345e-06, "loss": 0.87457716, "num_input_tokens_seen": 62501530, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.703125, "step": 2888, "time_per_iteration": 2.3673512935638428 }, { "auxiliary_loss_clip": 0.01097958, "auxiliary_loss_mlp": 0.01039942, "balance_loss_clip": 1.01965261, "balance_loss_mlp": 1.02668357, "epoch": 0.1736960769577634, "flos": 23439812446080.0, "grad_norm": 2.7465663528092237, "language_loss": 0.78378886, "learning_rate": 3.7098536857076315e-06, "loss": 0.80516785, "num_input_tokens_seen": 62521295, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7109375, "step": 2889, "time_per_iteration": 2.4165608882904053 }, { "auxiliary_loss_clip": 0.01094021, "auxiliary_loss_mlp": 0.01035741, "balance_loss_clip": 1.01685858, "balance_loss_mlp": 1.0275017, "epoch": 0.1737562002104314, "flos": 18586850284800.0, "grad_norm": 2.261143993961399, "language_loss": 0.83725846, "learning_rate": 3.7096576604012492e-06, "loss": 0.85855603, "num_input_tokens_seen": 62539615, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6640625, "step": 2890, "time_per_iteration": 2.373586893081665 }, { "auxiliary_loss_clip": 0.01099012, "auxiliary_loss_mlp": 0.01044074, "balance_loss_clip": 1.02492881, "balance_loss_mlp": 1.02829003, "epoch": 0.17381632346309936, "flos": 15997132742400.0, "grad_norm": 2.1077282277956457, "language_loss": 0.82070744, "learning_rate": 3.7094615740815824e-06, "loss": 0.84213829, "num_input_tokens_seen": 62556820, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.70703125, "step": 2891, "time_per_iteration": 2.340186595916748 }, { "auxiliary_loss_clip": 0.01098339, "auxiliary_loss_mlp": 0.01034982, "balance_loss_clip": 1.01451373, "balance_loss_mlp": 1.02644587, "epoch": 0.17387644671576732, "flos": 13369010837760.0, "grad_norm": 1.9985918358305603, "language_loss": 0.80757391, "learning_rate": 3.709265426755629e-06, "loss": 0.82890713, "num_input_tokens_seen": 62572450, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.71875, "step": 2892, "time_per_iteration": 2.343317985534668 }, { "auxiliary_loss_clip": 0.01100705, "auxiliary_loss_mlp": 0.01040469, "balance_loss_clip": 1.01998901, "balance_loss_mlp": 1.0291729, "epoch": 0.1739365699684353, "flos": 26614290216960.0, "grad_norm": 2.924729453432933, "language_loss": 0.74330664, "learning_rate": 3.7090692184303894e-06, "loss": 0.76471835, "num_input_tokens_seen": 62592580, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.71484375, "step": 2893, "time_per_iteration": 2.4177193641662598 }, { "auxiliary_loss_clip": 0.01099165, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.02050877, "balance_loss_mlp": 1.02689755, "epoch": 0.17399669322110325, "flos": 23366843971200.0, "grad_norm": 1.9308889919567662, "language_loss": 0.82883406, "learning_rate": 3.7088729491128665e-06, "loss": 0.85023499, "num_input_tokens_seen": 62611220, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.72265625, "step": 2894, "time_per_iteration": 2.4008944034576416 }, { "auxiliary_loss_clip": 0.010986, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.01236343, "balance_loss_mlp": 1.02747357, "epoch": 0.17405681647377122, "flos": 22053027398400.0, "grad_norm": 4.187679183561046, "language_loss": 0.74383038, "learning_rate": 3.708676618810063e-06, "loss": 0.76515388, "num_input_tokens_seen": 62629185, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.7109375, "step": 2895, "time_per_iteration": 2.3879427909851074 }, { "auxiliary_loss_clip": 0.01029121, "auxiliary_loss_mlp": 0.01007236, "balance_loss_clip": 1.00432754, "balance_loss_mlp": 1.00671434, "epoch": 0.1741169397264392, "flos": 61454396044800.0, "grad_norm": 0.8721027012547796, "language_loss": 0.62732995, "learning_rate": 3.7084802275289866e-06, "loss": 0.64769351, "num_input_tokens_seen": 62691895, "router_z_loss_clip": 0.02905273, "router_z_loss_mlp": 0.22460938, "step": 2896, "time_per_iteration": 3.1353063583374023 }, { "auxiliary_loss_clip": 0.01096297, "auxiliary_loss_mlp": 0.01033438, "balance_loss_clip": 1.01472223, "balance_loss_mlp": 1.02518332, "epoch": 0.17417706297910718, "flos": 27016419790080.0, "grad_norm": 2.159513337646916, "language_loss": 0.75981808, "learning_rate": 3.708283775276645e-06, "loss": 0.78111547, "num_input_tokens_seen": 62713790, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.7109375, "step": 2897, "time_per_iteration": 2.430640935897827 }, { "auxiliary_loss_clip": 0.01096242, "auxiliary_loss_mlp": 0.0103623, "balance_loss_clip": 1.01592231, "balance_loss_mlp": 1.02759027, "epoch": 0.17423718623177514, "flos": 33507506943360.0, "grad_norm": 2.226953318823505, "language_loss": 0.69583464, "learning_rate": 3.70808726206005e-06, "loss": 0.71715933, "num_input_tokens_seen": 62736285, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6875, "step": 2898, "time_per_iteration": 3.8754465579986572 }, { "auxiliary_loss_clip": 0.01102547, "auxiliary_loss_mlp": 0.01040489, "balance_loss_clip": 1.01961541, "balance_loss_mlp": 1.02899253, "epoch": 0.1742973094844431, "flos": 27197409611520.0, "grad_norm": 2.4331251828178315, "language_loss": 0.76197898, "learning_rate": 3.7078906878862145e-06, "loss": 0.78340936, "num_input_tokens_seen": 62756240, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.734375, "step": 2899, "time_per_iteration": 2.4297640323638916 }, { "auxiliary_loss_clip": 0.01094865, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.01890135, "balance_loss_mlp": 1.02577353, "epoch": 0.17435743273711107, "flos": 22709638938240.0, "grad_norm": 1.829918370174299, "language_loss": 0.7232179, "learning_rate": 3.7076940527621536e-06, "loss": 0.74455196, "num_input_tokens_seen": 62775910, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.69140625, "step": 2900, "time_per_iteration": 2.4094789028167725 }, { "auxiliary_loss_clip": 0.01101356, "auxiliary_loss_mlp": 0.01043212, "balance_loss_clip": 1.02312541, "balance_loss_mlp": 1.0292275, "epoch": 0.17441755598977904, "flos": 41644853435520.0, "grad_norm": 1.6640230087428245, "language_loss": 0.69881225, "learning_rate": 3.707497356694884e-06, "loss": 0.72025788, "num_input_tokens_seen": 62799385, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.72265625, "step": 2901, "time_per_iteration": 5.310842990875244 }, { "auxiliary_loss_clip": 0.01100747, "auxiliary_loss_mlp": 0.01041757, "balance_loss_clip": 1.0216223, "balance_loss_mlp": 1.02748847, "epoch": 0.174477679242447, "flos": 26285862257280.0, "grad_norm": 2.413978229389291, "language_loss": 0.76461095, "learning_rate": 3.707300599691427e-06, "loss": 0.78603601, "num_input_tokens_seen": 62819380, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.734375, "step": 2902, "time_per_iteration": 3.814647674560547 }, { "auxiliary_loss_clip": 0.01099999, "auxiliary_loss_mlp": 0.01043622, "balance_loss_clip": 1.02456009, "balance_loss_mlp": 1.02765155, "epoch": 0.174537802495115, "flos": 17857444826880.0, "grad_norm": 2.1695329462755764, "language_loss": 0.81537986, "learning_rate": 3.7071037817588023e-06, "loss": 0.83681607, "num_input_tokens_seen": 62836205, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.72265625, "step": 2903, "time_per_iteration": 2.345242500305176 }, { "auxiliary_loss_clip": 0.01098288, "auxiliary_loss_mlp": 0.01034931, "balance_loss_clip": 1.01455855, "balance_loss_mlp": 1.02797866, "epoch": 0.17459792574778296, "flos": 16939927630080.0, "grad_norm": 3.2726253911474683, "language_loss": 0.73323435, "learning_rate": 3.706906902904036e-06, "loss": 0.75456655, "num_input_tokens_seen": 62854045, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.703125, "step": 2904, "time_per_iteration": 2.3807260990142822 }, { "auxiliary_loss_clip": 0.01098457, "auxiliary_loss_mlp": 0.01037204, "balance_loss_clip": 1.0170573, "balance_loss_mlp": 1.02728105, "epoch": 0.17465804900045093, "flos": 25518855398400.0, "grad_norm": 1.9188512777035645, "language_loss": 0.64299375, "learning_rate": 3.7067099631341517e-06, "loss": 0.66435039, "num_input_tokens_seen": 62873075, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.7109375, "step": 2905, "time_per_iteration": 2.408696413040161 }, { "auxiliary_loss_clip": 0.01106027, "auxiliary_loss_mlp": 0.01047717, "balance_loss_clip": 1.02572274, "balance_loss_mlp": 1.03002274, "epoch": 0.1747181722531189, "flos": 24128683948800.0, "grad_norm": 1.7097878330468699, "language_loss": 0.7937634, "learning_rate": 3.70651296245618e-06, "loss": 0.81530094, "num_input_tokens_seen": 62892675, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.7578125, "step": 2906, "time_per_iteration": 2.4063754081726074 }, { "auxiliary_loss_clip": 0.01099716, "auxiliary_loss_mlp": 0.01047005, "balance_loss_clip": 1.0270853, "balance_loss_mlp": 1.02893543, "epoch": 0.17477829550578686, "flos": 17747852469120.0, "grad_norm": 1.6778656885318153, "language_loss": 0.80657685, "learning_rate": 3.70631590087715e-06, "loss": 0.82804406, "num_input_tokens_seen": 62910675, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.70703125, "step": 2907, "time_per_iteration": 2.3640167713165283 }, { "auxiliary_loss_clip": 0.01098921, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.02270007, "balance_loss_mlp": 1.02678657, "epoch": 0.17483841875845482, "flos": 15376446858240.0, "grad_norm": 2.761525710535151, "language_loss": 0.80839372, "learning_rate": 3.706118778404095e-06, "loss": 0.82980806, "num_input_tokens_seen": 62928130, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.72265625, "step": 2908, "time_per_iteration": 2.372986078262329 }, { "auxiliary_loss_clip": 0.0109838, "auxiliary_loss_mlp": 0.01037755, "balance_loss_clip": 1.017609, "balance_loss_mlp": 1.02853179, "epoch": 0.17489854201112282, "flos": 17162359102080.0, "grad_norm": 2.5263439339312135, "language_loss": 0.80055851, "learning_rate": 3.7059215950440487e-06, "loss": 0.8219198, "num_input_tokens_seen": 62944290, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.69921875, "step": 2909, "time_per_iteration": 2.3411989212036133 }, { "auxiliary_loss_clip": 0.01097792, "auxiliary_loss_mlp": 0.01038519, "balance_loss_clip": 1.01776505, "balance_loss_mlp": 1.02661288, "epoch": 0.17495866526379078, "flos": 19754276060160.0, "grad_norm": 2.032621032577187, "language_loss": 0.76930559, "learning_rate": 3.7057243508040494e-06, "loss": 0.79066873, "num_input_tokens_seen": 62963505, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.71484375, "step": 2910, "time_per_iteration": 2.3876938819885254 }, { "auxiliary_loss_clip": 0.01100046, "auxiliary_loss_mlp": 0.01041668, "balance_loss_clip": 1.02018619, "balance_loss_mlp": 1.02748728, "epoch": 0.17501878851645875, "flos": 28509899552640.0, "grad_norm": 2.720788766012272, "language_loss": 0.87412465, "learning_rate": 3.7055270456911354e-06, "loss": 0.89554185, "num_input_tokens_seen": 62985020, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.7265625, "step": 2911, "time_per_iteration": 2.4427218437194824 }, { "auxiliary_loss_clip": 0.01097482, "auxiliary_loss_mlp": 0.01043231, "balance_loss_clip": 1.02215457, "balance_loss_mlp": 1.02574348, "epoch": 0.1750789117691267, "flos": 17930238744960.0, "grad_norm": 2.365782983215061, "language_loss": 0.89540219, "learning_rate": 3.7053296797123485e-06, "loss": 0.91680932, "num_input_tokens_seen": 63001745, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.71875, "step": 2912, "time_per_iteration": 2.364178419113159 }, { "auxiliary_loss_clip": 0.01098745, "auxiliary_loss_mlp": 0.01041352, "balance_loss_clip": 1.01925063, "balance_loss_mlp": 1.02638698, "epoch": 0.17513903502179468, "flos": 18258457236480.0, "grad_norm": 1.9260236057718623, "language_loss": 0.7252481, "learning_rate": 3.7051322528747313e-06, "loss": 0.74664903, "num_input_tokens_seen": 63019750, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.72265625, "step": 2913, "time_per_iteration": 2.3827435970306396 }, { "auxiliary_loss_clip": 0.01096951, "auxiliary_loss_mlp": 0.01034689, "balance_loss_clip": 1.0148052, "balance_loss_mlp": 1.02793205, "epoch": 0.17519915827446264, "flos": 20703669194880.0, "grad_norm": 1.6688855674381564, "language_loss": 0.68798614, "learning_rate": 3.704934765185331e-06, "loss": 0.70930254, "num_input_tokens_seen": 63039500, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6875, "step": 2914, "time_per_iteration": 2.417940616607666 }, { "auxiliary_loss_clip": 0.01095125, "auxiliary_loss_mlp": 0.0103836, "balance_loss_clip": 1.01779675, "balance_loss_mlp": 1.02606404, "epoch": 0.1752592815271306, "flos": 20522330259840.0, "grad_norm": 1.706189824721809, "language_loss": 0.93502462, "learning_rate": 3.7047372166511945e-06, "loss": 0.95635939, "num_input_tokens_seen": 63059785, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.69140625, "step": 2915, "time_per_iteration": 2.3981058597564697 }, { "auxiliary_loss_clip": 0.01093396, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.01422966, "balance_loss_mlp": 1.02555728, "epoch": 0.1753194047797986, "flos": 21798091584000.0, "grad_norm": 1.65925447995177, "language_loss": 0.80993646, "learning_rate": 3.704539607279371e-06, "loss": 0.83121622, "num_input_tokens_seen": 63079385, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6796875, "step": 2916, "time_per_iteration": 2.411088705062866 }, { "auxiliary_loss_clip": 0.01099869, "auxiliary_loss_mlp": 0.0104288, "balance_loss_clip": 1.02164841, "balance_loss_mlp": 1.02748048, "epoch": 0.17537952803246656, "flos": 20667289691520.0, "grad_norm": 1.5880739721979988, "language_loss": 0.73977023, "learning_rate": 3.704341937076914e-06, "loss": 0.76119775, "num_input_tokens_seen": 63098970, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.72265625, "step": 2917, "time_per_iteration": 2.3958816528320312 }, { "auxiliary_loss_clip": 0.01094786, "auxiliary_loss_mlp": 0.01036699, "balance_loss_clip": 1.01557553, "balance_loss_mlp": 1.02746558, "epoch": 0.17543965128513453, "flos": 23293945319040.0, "grad_norm": 1.9491242793277963, "language_loss": 0.7629177, "learning_rate": 3.7041442060508778e-06, "loss": 0.78423256, "num_input_tokens_seen": 63118750, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.671875, "step": 2918, "time_per_iteration": 2.401737689971924 }, { "auxiliary_loss_clip": 0.01100381, "auxiliary_loss_mlp": 0.01037278, "balance_loss_clip": 1.01521242, "balance_loss_mlp": 1.02643442, "epoch": 0.1754997745378025, "flos": 29094345578880.0, "grad_norm": 3.161543355332977, "language_loss": 0.7428453, "learning_rate": 3.7039464142083183e-06, "loss": 0.76422191, "num_input_tokens_seen": 63136865, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.73828125, "step": 2919, "time_per_iteration": 2.4384982585906982 }, { "auxiliary_loss_clip": 0.01100849, "auxiliary_loss_mlp": 0.01042433, "balance_loss_clip": 1.02054608, "balance_loss_mlp": 1.02537584, "epoch": 0.17555989779047046, "flos": 30370560750720.0, "grad_norm": 2.1375587565939926, "language_loss": 0.74586523, "learning_rate": 3.7037485615562936e-06, "loss": 0.76729798, "num_input_tokens_seen": 63158325, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.75390625, "step": 2920, "time_per_iteration": 2.4775569438934326 }, { "auxiliary_loss_clip": 0.01094259, "auxiliary_loss_mlp": 0.01037155, "balance_loss_clip": 1.01831973, "balance_loss_mlp": 1.02546024, "epoch": 0.17562002104313842, "flos": 23286823401600.0, "grad_norm": 2.1269303189241673, "language_loss": 0.79498994, "learning_rate": 3.703550648101866e-06, "loss": 0.81630409, "num_input_tokens_seen": 63173115, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6875, "step": 2921, "time_per_iteration": 2.411048650741577 }, { "auxiliary_loss_clip": 0.01102464, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.02088106, "balance_loss_mlp": 1.02722764, "epoch": 0.1756801442958064, "flos": 24789345206400.0, "grad_norm": 1.720611101927665, "language_loss": 0.87780988, "learning_rate": 3.7033526738520983e-06, "loss": 0.89926982, "num_input_tokens_seen": 63192880, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.75, "step": 2922, "time_per_iteration": 2.4284417629241943 }, { "auxiliary_loss_clip": 0.01097197, "auxiliary_loss_mlp": 0.01043666, "balance_loss_clip": 1.02282798, "balance_loss_mlp": 1.02513099, "epoch": 0.17574026754847438, "flos": 25770579367680.0, "grad_norm": 2.2000436260039042, "language_loss": 0.62409222, "learning_rate": 3.7031546388140545e-06, "loss": 0.64550078, "num_input_tokens_seen": 63214395, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.71875, "step": 2923, "time_per_iteration": 2.4263389110565186 }, { "auxiliary_loss_clip": 0.01102621, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.01973307, "balance_loss_mlp": 1.02732992, "epoch": 0.17580039080114235, "flos": 17455664367360.0, "grad_norm": 2.059515887283728, "language_loss": 0.80213439, "learning_rate": 3.702956542994802e-06, "loss": 0.82357854, "num_input_tokens_seen": 63231020, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.75390625, "step": 2924, "time_per_iteration": 2.3719711303710938 }, { "auxiliary_loss_clip": 0.0110046, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.01857901, "balance_loss_mlp": 1.02674377, "epoch": 0.1758605140538103, "flos": 14863817232000.0, "grad_norm": 3.4252338567491845, "language_loss": 0.7123369, "learning_rate": 3.7027583864014123e-06, "loss": 0.73375309, "num_input_tokens_seen": 63246245, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 0.734375, "step": 2925, "time_per_iteration": 2.3547937870025635 }, { "auxiliary_loss_clip": 0.010994, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.01755607, "balance_loss_mlp": 1.02856195, "epoch": 0.17592063730647828, "flos": 23003118760320.0, "grad_norm": 1.7061490438206, "language_loss": 0.71652341, "learning_rate": 3.7025601690409555e-06, "loss": 0.73789358, "num_input_tokens_seen": 63267790, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.70703125, "step": 2926, "time_per_iteration": 2.4420547485351562 }, { "auxiliary_loss_clip": 0.01100969, "auxiliary_loss_mlp": 0.01036308, "balance_loss_clip": 1.01409912, "balance_loss_mlp": 1.02715552, "epoch": 0.17598076055914624, "flos": 20740432723200.0, "grad_norm": 1.8321860126161547, "language_loss": 0.8483274, "learning_rate": 3.702361890920505e-06, "loss": 0.86970007, "num_input_tokens_seen": 63286830, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 0.73828125, "step": 2927, "time_per_iteration": 2.3879168033599854 }, { "auxiliary_loss_clip": 0.0109817, "auxiliary_loss_mlp": 0.01040642, "balance_loss_clip": 1.02124703, "balance_loss_mlp": 1.02745426, "epoch": 0.1760408838118142, "flos": 34091080185600.0, "grad_norm": 1.9231681298547754, "language_loss": 0.7214148, "learning_rate": 3.702163552047138e-06, "loss": 0.74280298, "num_input_tokens_seen": 63308870, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.70703125, "step": 2928, "time_per_iteration": 2.513338804244995 }, { "auxiliary_loss_clip": 0.01095716, "auxiliary_loss_mlp": 0.01038513, "balance_loss_clip": 1.01797342, "balance_loss_mlp": 1.02707195, "epoch": 0.1761010070644822, "flos": 24167297779200.0, "grad_norm": 1.8471882104266197, "language_loss": 0.83402288, "learning_rate": 3.7019651524279326e-06, "loss": 0.85536516, "num_input_tokens_seen": 63329005, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.6875, "step": 2929, "time_per_iteration": 2.391378164291382 }, { "auxiliary_loss_clip": 0.01098179, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.02446914, "balance_loss_mlp": 1.02659583, "epoch": 0.17616113031715017, "flos": 26575536741120.0, "grad_norm": 1.5544288993568705, "language_loss": 0.79389054, "learning_rate": 3.7017666920699693e-06, "loss": 0.81531119, "num_input_tokens_seen": 63349390, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.71875, "step": 2930, "time_per_iteration": 2.433692216873169 }, { "auxiliary_loss_clip": 0.01100173, "auxiliary_loss_mlp": 0.01036324, "balance_loss_clip": 1.01531935, "balance_loss_mlp": 1.02801824, "epoch": 0.17622125356981813, "flos": 25665490575360.0, "grad_norm": 2.2015391258894117, "language_loss": 0.77019572, "learning_rate": 3.701568170980329e-06, "loss": 0.79156071, "num_input_tokens_seen": 63368835, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.72265625, "step": 2931, "time_per_iteration": 2.4104647636413574 }, { "auxiliary_loss_clip": 0.01098125, "auxiliary_loss_mlp": 0.01036678, "balance_loss_clip": 1.0169245, "balance_loss_mlp": 1.02678323, "epoch": 0.1762813768224861, "flos": 16507597864320.0, "grad_norm": 2.7257495806613976, "language_loss": 0.74576712, "learning_rate": 3.7013695891660985e-06, "loss": 0.76711518, "num_input_tokens_seen": 63385220, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.71484375, "step": 2932, "time_per_iteration": 2.381547212600708 }, { "auxiliary_loss_clip": 0.01104459, "auxiliary_loss_mlp": 0.01046361, "balance_loss_clip": 1.02371168, "balance_loss_mlp": 1.02852345, "epoch": 0.17634150007515406, "flos": 11211239036160.0, "grad_norm": 2.7797515402489887, "language_loss": 0.89398766, "learning_rate": 3.701170946634364e-06, "loss": 0.91549587, "num_input_tokens_seen": 63400865, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 0.7578125, "step": 2933, "time_per_iteration": 2.340221643447876 }, { "auxiliary_loss_clip": 0.01096472, "auxiliary_loss_mlp": 0.01042185, "balance_loss_clip": 1.02317154, "balance_loss_mlp": 1.02740037, "epoch": 0.17640162332782203, "flos": 23658787693440.0, "grad_norm": 1.7039241733091834, "language_loss": 0.88141811, "learning_rate": 3.700972243392214e-06, "loss": 0.90280473, "num_input_tokens_seen": 63421390, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.69140625, "step": 2934, "time_per_iteration": 2.44293212890625 }, { "auxiliary_loss_clip": 0.01092239, "auxiliary_loss_mlp": 0.01039757, "balance_loss_clip": 1.02050495, "balance_loss_mlp": 1.0247401, "epoch": 0.17646174658049, "flos": 53795012198400.0, "grad_norm": 1.544438353879266, "language_loss": 0.70650262, "learning_rate": 3.70077347944674e-06, "loss": 0.72782254, "num_input_tokens_seen": 63444715, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.67578125, "step": 2935, "time_per_iteration": 2.655700206756592 }, { "auxiliary_loss_clip": 0.01101648, "auxiliary_loss_mlp": 0.01037841, "balance_loss_clip": 1.01676512, "balance_loss_mlp": 1.02728581, "epoch": 0.17652186983315798, "flos": 24242710049280.0, "grad_norm": 2.695729821185055, "language_loss": 0.70003366, "learning_rate": 3.7005746548050353e-06, "loss": 0.72142857, "num_input_tokens_seen": 63465525, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.7421875, "step": 2936, "time_per_iteration": 2.4301464557647705 }, { "auxiliary_loss_clip": 0.01101024, "auxiliary_loss_mlp": 0.01038122, "balance_loss_clip": 1.01888156, "balance_loss_mlp": 1.03025663, "epoch": 0.17658199308582595, "flos": 27453043653120.0, "grad_norm": 1.802780086834324, "language_loss": 0.71520585, "learning_rate": 3.7003757694741956e-06, "loss": 0.7365973, "num_input_tokens_seen": 63485815, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.70703125, "step": 2937, "time_per_iteration": 3.927194118499756 }, { "auxiliary_loss_clip": 0.01102053, "auxiliary_loss_mlp": 0.01042294, "balance_loss_clip": 1.02105069, "balance_loss_mlp": 1.0282346, "epoch": 0.17664211633849392, "flos": 22417590481920.0, "grad_norm": 4.800873366126143, "language_loss": 0.75749171, "learning_rate": 3.7001768234613188e-06, "loss": 0.77893519, "num_input_tokens_seen": 63503905, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.73828125, "step": 2938, "time_per_iteration": 2.4505906105041504 }, { "auxiliary_loss_clip": 0.01098209, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.01573217, "balance_loss_mlp": 1.02652121, "epoch": 0.17670223959116188, "flos": 24714037670400.0, "grad_norm": 2.297893445036306, "language_loss": 0.71310973, "learning_rate": 3.6999778167735043e-06, "loss": 0.73444581, "num_input_tokens_seen": 63521985, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.71875, "step": 2939, "time_per_iteration": 2.422348737716675 }, { "auxiliary_loss_clip": 0.01099561, "auxiliary_loss_mlp": 0.01036472, "balance_loss_clip": 1.01658833, "balance_loss_mlp": 1.02866793, "epoch": 0.17676236284382985, "flos": 22525995853440.0, "grad_norm": 2.369397878630063, "language_loss": 0.73411208, "learning_rate": 3.699778749417855e-06, "loss": 0.75547242, "num_input_tokens_seen": 63539830, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.70703125, "step": 2940, "time_per_iteration": 3.8665661811828613 }, { "auxiliary_loss_clip": 0.01098768, "auxiliary_loss_mlp": 0.01037577, "balance_loss_clip": 1.01682305, "balance_loss_mlp": 1.0269599, "epoch": 0.1768224860964978, "flos": 12384355363200.0, "grad_norm": 2.239458217562189, "language_loss": 0.85782027, "learning_rate": 3.699579621401474e-06, "loss": 0.87918377, "num_input_tokens_seen": 63555495, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.71875, "step": 2941, "time_per_iteration": 3.711132287979126 }, { "auxiliary_loss_clip": 0.01097197, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.01241255, "balance_loss_mlp": 1.02669907, "epoch": 0.1768826093491658, "flos": 24352197672960.0, "grad_norm": 2.178196462890618, "language_loss": 0.76568735, "learning_rate": 3.699380432731468e-06, "loss": 0.78698421, "num_input_tokens_seen": 63575290, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.703125, "step": 2942, "time_per_iteration": 3.78407621383667 }, { "auxiliary_loss_clip": 0.01098873, "auxiliary_loss_mlp": 0.01038952, "balance_loss_clip": 1.01731586, "balance_loss_mlp": 1.02701664, "epoch": 0.17694273260183377, "flos": 23585923952640.0, "grad_norm": 3.5547954155900796, "language_loss": 0.79881883, "learning_rate": 3.699181183414946e-06, "loss": 0.82019711, "num_input_tokens_seen": 63594670, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.71875, "step": 2943, "time_per_iteration": 2.390895366668701 }, { "auxiliary_loss_clip": 0.01097381, "auxiliary_loss_mlp": 0.01040082, "balance_loss_clip": 1.01857674, "balance_loss_mlp": 1.02504814, "epoch": 0.17700285585450173, "flos": 26759773319040.0, "grad_norm": 2.4635067207059373, "language_loss": 0.80503607, "learning_rate": 3.698981873459018e-06, "loss": 0.82641065, "num_input_tokens_seen": 63614780, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.72265625, "step": 2944, "time_per_iteration": 2.428388833999634 }, { "auxiliary_loss_clip": 0.01096845, "auxiliary_loss_mlp": 0.01047621, "balance_loss_clip": 1.02808249, "balance_loss_mlp": 1.02602494, "epoch": 0.1770629791071697, "flos": 42774712721280.0, "grad_norm": 2.0907109865020654, "language_loss": 0.73149455, "learning_rate": 3.6987825028707976e-06, "loss": 0.75293922, "num_input_tokens_seen": 63637190, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.70703125, "step": 2945, "time_per_iteration": 2.551279067993164 }, { "auxiliary_loss_clip": 0.0109669, "auxiliary_loss_mlp": 0.01039958, "balance_loss_clip": 1.01960874, "balance_loss_mlp": 1.02730441, "epoch": 0.17712310235983766, "flos": 17344675555200.0, "grad_norm": 2.518853551238311, "language_loss": 0.78102767, "learning_rate": 3.698583071657399e-06, "loss": 0.80239409, "num_input_tokens_seen": 63652140, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6953125, "step": 2946, "time_per_iteration": 2.349885940551758 }, { "auxiliary_loss_clip": 0.01096569, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.01380908, "balance_loss_mlp": 1.0272876, "epoch": 0.17718322561250563, "flos": 23877344004480.0, "grad_norm": 2.926468314024668, "language_loss": 0.76134998, "learning_rate": 3.6983835798259404e-06, "loss": 0.78264475, "num_input_tokens_seen": 63671700, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.69140625, "step": 2947, "time_per_iteration": 2.394010543823242 }, { "auxiliary_loss_clip": 0.01094627, "auxiliary_loss_mlp": 0.01039666, "balance_loss_clip": 1.02019954, "balance_loss_mlp": 1.02523696, "epoch": 0.1772433488651736, "flos": 36464859768960.0, "grad_norm": 3.613830340342464, "language_loss": 0.72849512, "learning_rate": 3.6981840273835405e-06, "loss": 0.74983805, "num_input_tokens_seen": 63691685, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6953125, "step": 2948, "time_per_iteration": 2.5140676498413086 }, { "auxiliary_loss_clip": 0.01096881, "auxiliary_loss_mlp": 0.01034496, "balance_loss_clip": 1.01445663, "balance_loss_mlp": 1.02745891, "epoch": 0.1773034721178416, "flos": 26683592999040.0, "grad_norm": 1.952034909545662, "language_loss": 0.81700194, "learning_rate": 3.6979844143373207e-06, "loss": 0.83831561, "num_input_tokens_seen": 63711720, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.6953125, "step": 2949, "time_per_iteration": 2.415473461151123 }, { "auxiliary_loss_clip": 0.01034546, "auxiliary_loss_mlp": 0.01004541, "balance_loss_clip": 1.0013696, "balance_loss_mlp": 1.01126313, "epoch": 0.17736359537050955, "flos": 57114377712000.0, "grad_norm": 0.8157807708418564, "language_loss": 0.64964092, "learning_rate": 3.6977847406944053e-06, "loss": 0.67003179, "num_input_tokens_seen": 63776280, "router_z_loss_clip": 0.03173828, "router_z_loss_mlp": 0.23242188, "step": 2950, "time_per_iteration": 3.150360345840454 }, { "auxiliary_loss_clip": 0.01094283, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.01507056, "balance_loss_mlp": 1.0265522, "epoch": 0.17742371862317752, "flos": 27196990675200.0, "grad_norm": 1.956804133860281, "language_loss": 0.83536267, "learning_rate": 3.6975850064619193e-06, "loss": 0.85664833, "num_input_tokens_seen": 63797535, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.67578125, "step": 2951, "time_per_iteration": 2.4206326007843018 }, { "auxiliary_loss_clip": 0.01097285, "auxiliary_loss_mlp": 0.01038312, "balance_loss_clip": 1.01787972, "balance_loss_mlp": 1.02587032, "epoch": 0.17748384187584548, "flos": 20958639920640.0, "grad_norm": 3.7777157195705753, "language_loss": 0.80479968, "learning_rate": 3.697385211646991e-06, "loss": 0.82615566, "num_input_tokens_seen": 63817045, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.7109375, "step": 2952, "time_per_iteration": 2.4163801670074463 }, { "auxiliary_loss_clip": 0.01095084, "auxiliary_loss_mlp": 0.01032055, "balance_loss_clip": 1.01189661, "balance_loss_mlp": 1.02589631, "epoch": 0.17754396512851345, "flos": 25008809212800.0, "grad_norm": 8.429013698081665, "language_loss": 0.79238909, "learning_rate": 3.697185356256751e-06, "loss": 0.8136605, "num_input_tokens_seen": 63837665, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.69140625, "step": 2953, "time_per_iteration": 2.409651517868042 }, { "auxiliary_loss_clip": 0.01098027, "auxiliary_loss_mlp": 0.01036523, "balance_loss_clip": 1.01796246, "balance_loss_mlp": 1.02777719, "epoch": 0.1776040883811814, "flos": 32050197216000.0, "grad_norm": 1.8421486948225045, "language_loss": 0.88229394, "learning_rate": 3.6969854402983314e-06, "loss": 0.90363944, "num_input_tokens_seen": 63858455, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.703125, "step": 2954, "time_per_iteration": 2.483719825744629 }, { "auxiliary_loss_clip": 0.01097547, "auxiliary_loss_mlp": 0.01041523, "balance_loss_clip": 1.0208286, "balance_loss_mlp": 1.02728677, "epoch": 0.17766421163384938, "flos": 21573216316800.0, "grad_norm": 2.006047180188921, "language_loss": 0.84834766, "learning_rate": 3.6967854637788665e-06, "loss": 0.86973828, "num_input_tokens_seen": 63876935, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.703125, "step": 2955, "time_per_iteration": 2.379448652267456 }, { "auxiliary_loss_clip": 0.01093515, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.01706147, "balance_loss_mlp": 1.02667737, "epoch": 0.17772433488651737, "flos": 22418218886400.0, "grad_norm": 2.3926075165858425, "language_loss": 0.70818555, "learning_rate": 3.696585426705493e-06, "loss": 0.72947741, "num_input_tokens_seen": 63896815, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.66796875, "step": 2956, "time_per_iteration": 2.41111421585083 }, { "auxiliary_loss_clip": 0.0109402, "auxiliary_loss_mlp": 0.01038841, "balance_loss_clip": 1.01956499, "balance_loss_mlp": 1.02558947, "epoch": 0.17778445813918534, "flos": 25628273199360.0, "grad_norm": 1.952670960754375, "language_loss": 0.82171714, "learning_rate": 3.6963853290853503e-06, "loss": 0.84304583, "num_input_tokens_seen": 63916140, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6875, "step": 2957, "time_per_iteration": 2.4054601192474365 }, { "auxiliary_loss_clip": 0.01093597, "auxiliary_loss_mlp": 0.01034734, "balance_loss_clip": 1.01699567, "balance_loss_mlp": 1.02670491, "epoch": 0.1778445813918533, "flos": 25627714617600.0, "grad_norm": 1.7895656136965656, "language_loss": 0.75002372, "learning_rate": 3.6961851709255784e-06, "loss": 0.77130711, "num_input_tokens_seen": 63935220, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.66796875, "step": 2958, "time_per_iteration": 2.432297706604004 }, { "auxiliary_loss_clip": 0.010995, "auxiliary_loss_mlp": 0.01033267, "balance_loss_clip": 1.01464653, "balance_loss_mlp": 1.02971029, "epoch": 0.17790470464452127, "flos": 22344447450240.0, "grad_norm": 2.2348646910534926, "language_loss": 0.80148596, "learning_rate": 3.6959849522333206e-06, "loss": 0.82281363, "num_input_tokens_seen": 63954550, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.69921875, "step": 2959, "time_per_iteration": 2.386784076690674 }, { "auxiliary_loss_clip": 0.01096945, "auxiliary_loss_mlp": 0.01039385, "balance_loss_clip": 1.01919079, "balance_loss_mlp": 1.02686608, "epoch": 0.17796482789718923, "flos": 18765012286080.0, "grad_norm": 1.75423547245717, "language_loss": 0.51365209, "learning_rate": 3.6957846730157222e-06, "loss": 0.53501546, "num_input_tokens_seen": 63972425, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.703125, "step": 2960, "time_per_iteration": 2.3954639434814453 }, { "auxiliary_loss_clip": 0.01100382, "auxiliary_loss_mlp": 0.01044366, "balance_loss_clip": 1.02450538, "balance_loss_mlp": 1.02857542, "epoch": 0.1780249511498572, "flos": 23439812446080.0, "grad_norm": 1.9753834197272402, "language_loss": 0.88879579, "learning_rate": 3.6955843332799317e-06, "loss": 0.91024327, "num_input_tokens_seen": 63992165, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.71875, "step": 2961, "time_per_iteration": 2.3947105407714844 }, { "auxiliary_loss_clip": 0.01097315, "auxiliary_loss_mlp": 0.01043823, "balance_loss_clip": 1.02297306, "balance_loss_mlp": 1.02615452, "epoch": 0.1780850744025252, "flos": 23366355212160.0, "grad_norm": 1.6960386384330346, "language_loss": 0.79236126, "learning_rate": 3.6953839330330972e-06, "loss": 0.81377268, "num_input_tokens_seen": 64013470, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7109375, "step": 2962, "time_per_iteration": 2.4387855529785156 }, { "auxiliary_loss_clip": 0.01099002, "auxiliary_loss_mlp": 0.01040503, "balance_loss_clip": 1.02032113, "balance_loss_mlp": 1.02963376, "epoch": 0.17814519765519315, "flos": 13771140410880.0, "grad_norm": 2.200560901801987, "language_loss": 0.74530143, "learning_rate": 3.6951834722823715e-06, "loss": 0.76669645, "num_input_tokens_seen": 64030975, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.6953125, "step": 2963, "time_per_iteration": 2.357468366622925 }, { "auxiliary_loss_clip": 0.01098758, "auxiliary_loss_mlp": 0.010387, "balance_loss_clip": 1.01862502, "balance_loss_mlp": 1.0284189, "epoch": 0.17820532090786112, "flos": 21975450624000.0, "grad_norm": 1.658869639699114, "language_loss": 0.78876424, "learning_rate": 3.6949829510349082e-06, "loss": 0.81013888, "num_input_tokens_seen": 64050075, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.703125, "step": 2964, "time_per_iteration": 2.448072671890259 }, { "auxiliary_loss_clip": 0.01097214, "auxiliary_loss_mlp": 0.01040353, "balance_loss_clip": 1.02223301, "balance_loss_mlp": 1.02902925, "epoch": 0.17826544416052909, "flos": 24789589585920.0, "grad_norm": 2.460196336053275, "language_loss": 0.80767095, "learning_rate": 3.6947823692978634e-06, "loss": 0.82904661, "num_input_tokens_seen": 64071920, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6796875, "step": 2965, "time_per_iteration": 2.414928436279297 }, { "auxiliary_loss_clip": 0.01095953, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.02256083, "balance_loss_mlp": 1.02708244, "epoch": 0.17832556741319705, "flos": 13878777732480.0, "grad_norm": 2.4380853617515474, "language_loss": 0.94539225, "learning_rate": 3.6945817270783955e-06, "loss": 0.9667604, "num_input_tokens_seen": 64086835, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6875, "step": 2966, "time_per_iteration": 2.3657639026641846 }, { "auxiliary_loss_clip": 0.01095675, "auxiliary_loss_mlp": 0.01038397, "balance_loss_clip": 1.01912105, "balance_loss_mlp": 1.02737188, "epoch": 0.17838569066586502, "flos": 36640403418240.0, "grad_norm": 2.6088065706095853, "language_loss": 0.72639889, "learning_rate": 3.6943810243836648e-06, "loss": 0.74773961, "num_input_tokens_seen": 64107360, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.68359375, "step": 2967, "time_per_iteration": 2.495950222015381 }, { "auxiliary_loss_clip": 0.01092959, "auxiliary_loss_mlp": 0.01038678, "balance_loss_clip": 1.01980734, "balance_loss_mlp": 1.02761495, "epoch": 0.17844581391853298, "flos": 18726468278400.0, "grad_norm": 2.019554405112788, "language_loss": 0.77192456, "learning_rate": 3.6941802612208334e-06, "loss": 0.7932409, "num_input_tokens_seen": 64124690, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.65625, "step": 2968, "time_per_iteration": 2.385192632675171 }, { "auxiliary_loss_clip": 0.01096339, "auxiliary_loss_mlp": 0.01039807, "balance_loss_clip": 1.02190185, "balance_loss_mlp": 1.02813447, "epoch": 0.17850593717120097, "flos": 27377107712640.0, "grad_norm": 2.374990649800464, "language_loss": 0.75913197, "learning_rate": 3.6939794375970667e-06, "loss": 0.78049338, "num_input_tokens_seen": 64146315, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.68359375, "step": 2969, "time_per_iteration": 2.4243085384368896 }, { "auxiliary_loss_clip": 0.01040425, "auxiliary_loss_mlp": 0.01015848, "balance_loss_clip": 1.01160467, "balance_loss_mlp": 1.01369655, "epoch": 0.17856606042386894, "flos": 66992913129600.0, "grad_norm": 0.839377654031634, "language_loss": 0.69052625, "learning_rate": 3.693778553519531e-06, "loss": 0.71108902, "num_input_tokens_seen": 64210875, "router_z_loss_clip": 0.04248047, "router_z_loss_mlp": 0.26757812, "step": 2970, "time_per_iteration": 3.1388394832611084 }, { "auxiliary_loss_clip": 0.01099696, "auxiliary_loss_mlp": 0.01035852, "balance_loss_clip": 1.01663566, "balance_loss_mlp": 1.02782619, "epoch": 0.1786261836765369, "flos": 36975499447680.0, "grad_norm": 1.832802303528834, "language_loss": 0.67340553, "learning_rate": 3.6935776089953956e-06, "loss": 0.69476104, "num_input_tokens_seen": 64230740, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.71875, "step": 2971, "time_per_iteration": 2.5004563331604004 }, { "auxiliary_loss_clip": 0.01095868, "auxiliary_loss_mlp": 0.01039161, "balance_loss_clip": 1.01888382, "balance_loss_mlp": 1.02606821, "epoch": 0.17868630692920487, "flos": 24824328255360.0, "grad_norm": 1.695710173332243, "language_loss": 0.89951611, "learning_rate": 3.6933766040318323e-06, "loss": 0.92086643, "num_input_tokens_seen": 64252300, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6953125, "step": 2972, "time_per_iteration": 2.4530422687530518 }, { "auxiliary_loss_clip": 0.01098219, "auxiliary_loss_mlp": 0.010454, "balance_loss_clip": 1.02532518, "balance_loss_mlp": 1.02798152, "epoch": 0.17874643018187283, "flos": 16981055078400.0, "grad_norm": 2.9073345714967735, "language_loss": 0.87565172, "learning_rate": 3.693175538636014e-06, "loss": 0.89708793, "num_input_tokens_seen": 64270105, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.703125, "step": 2973, "time_per_iteration": 2.346275806427002 }, { "auxiliary_loss_clip": 0.01095117, "auxiliary_loss_mlp": 0.01043492, "balance_loss_clip": 1.02205861, "balance_loss_mlp": 1.02672338, "epoch": 0.1788065534345408, "flos": 21031189459200.0, "grad_norm": 2.6152529641254287, "language_loss": 0.76249814, "learning_rate": 3.692974412815116e-06, "loss": 0.78388429, "num_input_tokens_seen": 64287250, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.68359375, "step": 2974, "time_per_iteration": 2.4068634510040283 }, { "auxiliary_loss_clip": 0.01097236, "auxiliary_loss_mlp": 0.01038095, "balance_loss_clip": 1.01747215, "balance_loss_mlp": 1.02757478, "epoch": 0.17886667668720876, "flos": 23986587248640.0, "grad_norm": 2.789210916353887, "language_loss": 0.74411094, "learning_rate": 3.692773226576315e-06, "loss": 0.76546419, "num_input_tokens_seen": 64307140, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.6953125, "step": 2975, "time_per_iteration": 2.387455940246582 }, { "auxiliary_loss_clip": 0.01095074, "auxiliary_loss_mlp": 0.01033124, "balance_loss_clip": 1.01447964, "balance_loss_mlp": 1.02747369, "epoch": 0.17892679993987676, "flos": 25738284493440.0, "grad_norm": 1.5661051401773733, "language_loss": 0.72881365, "learning_rate": 3.692571979926793e-06, "loss": 0.75009561, "num_input_tokens_seen": 64328760, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.67578125, "step": 2976, "time_per_iteration": 2.446561098098755 }, { "auxiliary_loss_clip": 0.01091909, "auxiliary_loss_mlp": 0.01031705, "balance_loss_clip": 1.0149678, "balance_loss_mlp": 1.02750278, "epoch": 0.17898692319254472, "flos": 25698588410880.0, "grad_norm": 1.5147232149525842, "language_loss": 0.77300251, "learning_rate": 3.69237067287373e-06, "loss": 0.79423863, "num_input_tokens_seen": 64348800, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.64453125, "step": 2977, "time_per_iteration": 3.884591579437256 }, { "auxiliary_loss_clip": 0.01098059, "auxiliary_loss_mlp": 0.01046172, "balance_loss_clip": 1.0268724, "balance_loss_mlp": 1.03055906, "epoch": 0.1790470464452127, "flos": 19316779413120.0, "grad_norm": 2.1145517949177695, "language_loss": 0.79672265, "learning_rate": 3.6921693054243118e-06, "loss": 0.81816506, "num_input_tokens_seen": 64367955, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.67578125, "step": 2978, "time_per_iteration": 2.462244987487793 }, { "auxiliary_loss_clip": 0.01098729, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.01608396, "balance_loss_mlp": 1.02767551, "epoch": 0.17910716969788065, "flos": 30042970663680.0, "grad_norm": 1.655474119658755, "language_loss": 0.76386064, "learning_rate": 3.6919678775857235e-06, "loss": 0.78520751, "num_input_tokens_seen": 64389805, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.7109375, "step": 2979, "time_per_iteration": 2.4736757278442383 }, { "auxiliary_loss_clip": 0.01095602, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.01668274, "balance_loss_mlp": 1.02867532, "epoch": 0.17916729295054862, "flos": 19426685973120.0, "grad_norm": 1.987467858622668, "language_loss": 0.68844259, "learning_rate": 3.691766389365154e-06, "loss": 0.70974636, "num_input_tokens_seen": 64408220, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.671875, "step": 2980, "time_per_iteration": 5.1556713581085205 }, { "auxiliary_loss_clip": 0.01100295, "auxiliary_loss_mlp": 0.01039066, "balance_loss_clip": 1.01813245, "balance_loss_mlp": 1.02987719, "epoch": 0.17922741620321658, "flos": 14610661896960.0, "grad_norm": 1.717844595830466, "language_loss": 0.70527929, "learning_rate": 3.6915648407697936e-06, "loss": 0.72667289, "num_input_tokens_seen": 64426380, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.703125, "step": 2981, "time_per_iteration": 2.354583501815796 }, { "auxiliary_loss_clip": 0.01099471, "auxiliary_loss_mlp": 0.01046799, "balance_loss_clip": 1.02537692, "balance_loss_mlp": 1.02893233, "epoch": 0.17928753945588458, "flos": 17164349049600.0, "grad_norm": 2.3933114345657147, "language_loss": 0.81727308, "learning_rate": 3.691363231806836e-06, "loss": 0.83873576, "num_input_tokens_seen": 64444355, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.703125, "step": 2982, "time_per_iteration": 3.780759811401367 }, { "auxiliary_loss_clip": 0.01095086, "auxiliary_loss_mlp": 0.01034561, "balance_loss_clip": 1.01578546, "balance_loss_mlp": 1.02700841, "epoch": 0.17934766270855254, "flos": 31394248992000.0, "grad_norm": 1.502508796775173, "language_loss": 0.8268553, "learning_rate": 3.691161562483474e-06, "loss": 0.84815174, "num_input_tokens_seen": 64467800, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6796875, "step": 2983, "time_per_iteration": 2.4457900524139404 }, { "auxiliary_loss_clip": 0.01097371, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.01946414, "balance_loss_mlp": 1.02622414, "epoch": 0.1794077859612205, "flos": 20813121907200.0, "grad_norm": 1.9901995182426722, "language_loss": 0.8515988, "learning_rate": 3.690959832806907e-06, "loss": 0.87297773, "num_input_tokens_seen": 64487230, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.7109375, "step": 2984, "time_per_iteration": 2.4008259773254395 }, { "auxiliary_loss_clip": 0.01096673, "auxiliary_loss_mlp": 0.01039458, "balance_loss_clip": 1.01822722, "balance_loss_mlp": 1.02621925, "epoch": 0.17946790921388847, "flos": 28985172157440.0, "grad_norm": 1.3482576795547023, "language_loss": 0.89483905, "learning_rate": 3.690758042784333e-06, "loss": 0.9162004, "num_input_tokens_seen": 64509165, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.70703125, "step": 2985, "time_per_iteration": 2.4310142993927 }, { "auxiliary_loss_clip": 0.0109346, "auxiliary_loss_mlp": 0.01037514, "balance_loss_clip": 1.01982343, "balance_loss_mlp": 1.02734017, "epoch": 0.17952803246655644, "flos": 20736452828160.0, "grad_norm": 1.9732589629646102, "language_loss": 0.69493186, "learning_rate": 3.690556192422954e-06, "loss": 0.7162416, "num_input_tokens_seen": 64527940, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.66015625, "step": 2986, "time_per_iteration": 2.4057769775390625 }, { "auxiliary_loss_clip": 0.01093622, "auxiliary_loss_mlp": 0.0103883, "balance_loss_clip": 1.01949382, "balance_loss_mlp": 1.02628994, "epoch": 0.1795881557192244, "flos": 28254754270080.0, "grad_norm": 2.1704732837980933, "language_loss": 0.77198172, "learning_rate": 3.6903542817299725e-06, "loss": 0.79330623, "num_input_tokens_seen": 64545230, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.671875, "step": 2987, "time_per_iteration": 2.412198066711426 }, { "auxiliary_loss_clip": 0.01098981, "auxiliary_loss_mlp": 0.01040336, "balance_loss_clip": 1.01951039, "balance_loss_mlp": 1.0278964, "epoch": 0.17964827897189237, "flos": 18551029363200.0, "grad_norm": 1.8687050131293323, "language_loss": 0.77970552, "learning_rate": 3.690152310712595e-06, "loss": 0.80109864, "num_input_tokens_seen": 64563820, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7109375, "step": 2988, "time_per_iteration": 2.426103353500366 }, { "auxiliary_loss_clip": 0.01027175, "auxiliary_loss_mlp": 0.01007762, "balance_loss_clip": 1.00511575, "balance_loss_mlp": 1.006791, "epoch": 0.17970840222456036, "flos": 58162261392000.0, "grad_norm": 0.7673995150315861, "language_loss": 0.62703419, "learning_rate": 3.6899502793780295e-06, "loss": 0.64738357, "num_input_tokens_seen": 64621315, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.203125, "step": 2989, "time_per_iteration": 2.9614171981811523 }, { "auxiliary_loss_clip": 0.01096796, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.01431358, "balance_loss_mlp": 1.02748251, "epoch": 0.17976852547722832, "flos": 20299828965120.0, "grad_norm": 2.5947417124638914, "language_loss": 0.70792025, "learning_rate": 3.689748187733485e-06, "loss": 0.72921705, "num_input_tokens_seen": 64639885, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.69140625, "step": 2990, "time_per_iteration": 2.399005889892578 }, { "auxiliary_loss_clip": 0.01096707, "auxiliary_loss_mlp": 0.01041279, "balance_loss_clip": 1.02242041, "balance_loss_mlp": 1.02798045, "epoch": 0.1798286487298963, "flos": 39668001632640.0, "grad_norm": 1.794322181293968, "language_loss": 0.68833303, "learning_rate": 3.6895460357861743e-06, "loss": 0.70971286, "num_input_tokens_seen": 64661220, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6875, "step": 2991, "time_per_iteration": 2.546729803085327 }, { "auxiliary_loss_clip": 0.01095533, "auxiliary_loss_mlp": 0.01036174, "balance_loss_clip": 1.01694548, "balance_loss_mlp": 1.02731848, "epoch": 0.17988877198256426, "flos": 25519134689280.0, "grad_norm": 1.9749019486282824, "language_loss": 0.83044302, "learning_rate": 3.6893438235433117e-06, "loss": 0.85176003, "num_input_tokens_seen": 64682530, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6796875, "step": 2992, "time_per_iteration": 2.4183197021484375 }, { "auxiliary_loss_clip": 0.01093847, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.01723754, "balance_loss_mlp": 1.02816367, "epoch": 0.17994889523523222, "flos": 18806488848000.0, "grad_norm": 2.2885754724046654, "language_loss": 0.81842172, "learning_rate": 3.689141551012114e-06, "loss": 0.83970839, "num_input_tokens_seen": 64701025, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.65625, "step": 2993, "time_per_iteration": 2.3537867069244385 }, { "auxiliary_loss_clip": 0.01094229, "auxiliary_loss_mlp": 0.01031966, "balance_loss_clip": 1.01268983, "balance_loss_mlp": 1.02611601, "epoch": 0.18000901848790019, "flos": 21103424795520.0, "grad_norm": 1.8544353092325712, "language_loss": 0.78179508, "learning_rate": 3.688939218199799e-06, "loss": 0.80305707, "num_input_tokens_seen": 64719570, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.68359375, "step": 2994, "time_per_iteration": 2.401881456375122 }, { "auxiliary_loss_clip": 0.01097756, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.01721501, "balance_loss_mlp": 1.02783489, "epoch": 0.18006914174056818, "flos": 19645416840960.0, "grad_norm": 2.2977123948197695, "language_loss": 0.80790877, "learning_rate": 3.6887368251135875e-06, "loss": 0.82924068, "num_input_tokens_seen": 64738110, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.69921875, "step": 2995, "time_per_iteration": 2.355602741241455 }, { "auxiliary_loss_clip": 0.01096986, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.01842737, "balance_loss_mlp": 1.02779651, "epoch": 0.18012926499323614, "flos": 19498886398080.0, "grad_norm": 2.004213944979025, "language_loss": 0.84364128, "learning_rate": 3.688534371760703e-06, "loss": 0.86497831, "num_input_tokens_seen": 64756345, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.69140625, "step": 2996, "time_per_iteration": 2.3671207427978516 }, { "auxiliary_loss_clip": 0.01091169, "auxiliary_loss_mlp": 0.01036037, "balance_loss_clip": 1.01834655, "balance_loss_mlp": 1.02571321, "epoch": 0.1801893882459041, "flos": 19463519324160.0, "grad_norm": 1.8246987670289778, "language_loss": 0.88096237, "learning_rate": 3.68833185814837e-06, "loss": 0.90223432, "num_input_tokens_seen": 64776375, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.65625, "step": 2997, "time_per_iteration": 2.428534507751465 }, { "auxiliary_loss_clip": 0.01098237, "auxiliary_loss_mlp": 0.01043539, "balance_loss_clip": 1.02237988, "balance_loss_mlp": 1.0258584, "epoch": 0.18024951149857207, "flos": 26869365676800.0, "grad_norm": 1.749073381435927, "language_loss": 0.85452026, "learning_rate": 3.688129284283816e-06, "loss": 0.875938, "num_input_tokens_seen": 64796210, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.72265625, "step": 2998, "time_per_iteration": 2.436452627182007 }, { "auxiliary_loss_clip": 0.01097172, "auxiliary_loss_mlp": 0.01042816, "balance_loss_clip": 1.02375484, "balance_loss_mlp": 1.02929878, "epoch": 0.18030963475124004, "flos": 30225322028160.0, "grad_norm": 1.898293642111629, "language_loss": 0.84303552, "learning_rate": 3.6879266501742705e-06, "loss": 0.86443543, "num_input_tokens_seen": 64818590, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6796875, "step": 2999, "time_per_iteration": 2.440263032913208 }, { "auxiliary_loss_clip": 0.01094612, "auxiliary_loss_mlp": 0.01039983, "balance_loss_clip": 1.0202781, "balance_loss_mlp": 1.0264858, "epoch": 0.180369758003908, "flos": 22306462024320.0, "grad_norm": 1.7817234409355363, "language_loss": 0.74977803, "learning_rate": 3.6877239558269642e-06, "loss": 0.77112401, "num_input_tokens_seen": 64838350, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6796875, "step": 3000, "time_per_iteration": 2.400106191635132 }, { "auxiliary_loss_clip": 0.01094685, "auxiliary_loss_mlp": 0.01045176, "balance_loss_clip": 1.02609098, "balance_loss_mlp": 1.02801108, "epoch": 0.18042988125657597, "flos": 23730918295680.0, "grad_norm": 1.6914341462004998, "language_loss": 0.7138685, "learning_rate": 3.687521201249132e-06, "loss": 0.73526716, "num_input_tokens_seen": 64858065, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.66796875, "step": 3001, "time_per_iteration": 2.4026381969451904 }, { "auxiliary_loss_clip": 0.01097361, "auxiliary_loss_mlp": 0.01037554, "balance_loss_clip": 1.01790857, "balance_loss_mlp": 1.02772808, "epoch": 0.18049000450924396, "flos": 24092548824960.0, "grad_norm": 1.9922244182544901, "language_loss": 0.88416296, "learning_rate": 3.687318386448008e-06, "loss": 0.90551209, "num_input_tokens_seen": 64877305, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6953125, "step": 3002, "time_per_iteration": 2.4055991172790527 }, { "auxiliary_loss_clip": 0.01094645, "auxiliary_loss_mlp": 0.01039409, "balance_loss_clip": 1.0210861, "balance_loss_mlp": 1.02737689, "epoch": 0.18055012776191193, "flos": 22162096085760.0, "grad_norm": 1.8975920308571603, "language_loss": 0.80576307, "learning_rate": 3.687115511430832e-06, "loss": 0.82710361, "num_input_tokens_seen": 64896955, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.671875, "step": 3003, "time_per_iteration": 2.3824965953826904 }, { "auxiliary_loss_clip": 0.01095178, "auxiliary_loss_mlp": 0.01038835, "balance_loss_clip": 1.01905823, "balance_loss_mlp": 1.02607787, "epoch": 0.1806102510145799, "flos": 28912238593920.0, "grad_norm": 2.2792963212946056, "language_loss": 0.66961324, "learning_rate": 3.6869125762048423e-06, "loss": 0.69095337, "num_input_tokens_seen": 64917080, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.69140625, "step": 3004, "time_per_iteration": 2.422297239303589 }, { "auxiliary_loss_clip": 0.01097773, "auxiliary_loss_mlp": 0.01040511, "balance_loss_clip": 1.02108049, "balance_loss_mlp": 1.02752137, "epoch": 0.18067037426724786, "flos": 19024696045440.0, "grad_norm": 1.6577558114338158, "language_loss": 0.85528255, "learning_rate": 3.6867095807772826e-06, "loss": 0.87666535, "num_input_tokens_seen": 64935215, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.703125, "step": 3005, "time_per_iteration": 2.3702433109283447 }, { "auxiliary_loss_clip": 0.01092354, "auxiliary_loss_mlp": 0.01038303, "balance_loss_clip": 1.01965904, "balance_loss_mlp": 1.02653646, "epoch": 0.18073049751991582, "flos": 27452415248640.0, "grad_norm": 1.545663060047751, "language_loss": 0.8309502, "learning_rate": 3.6865065251553967e-06, "loss": 0.85225677, "num_input_tokens_seen": 64956275, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.65625, "step": 3006, "time_per_iteration": 2.427565574645996 }, { "auxiliary_loss_clip": 0.01092904, "auxiliary_loss_mlp": 0.01038936, "balance_loss_clip": 1.0199697, "balance_loss_mlp": 1.02515292, "epoch": 0.1807906207725838, "flos": 28727827459200.0, "grad_norm": 1.6693494832060916, "language_loss": 0.77110308, "learning_rate": 3.6863034093464307e-06, "loss": 0.79242146, "num_input_tokens_seen": 64979390, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6796875, "step": 3007, "time_per_iteration": 2.4451706409454346 }, { "auxiliary_loss_clip": 0.0102929, "auxiliary_loss_mlp": 0.01018946, "balance_loss_clip": 1.01641917, "balance_loss_mlp": 1.00968313, "epoch": 0.18085074402525175, "flos": 64462791144960.0, "grad_norm": 0.7976309932500728, "language_loss": 0.56929284, "learning_rate": 3.686100233357634e-06, "loss": 0.5897752, "num_input_tokens_seen": 65043135, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.19628906, "step": 3008, "time_per_iteration": 3.161513328552246 }, { "auxiliary_loss_clip": 0.01097169, "auxiliary_loss_mlp": 0.01044263, "balance_loss_clip": 1.02424729, "balance_loss_mlp": 1.02932644, "epoch": 0.18091086727791975, "flos": 23475842835840.0, "grad_norm": 1.9043645543070598, "language_loss": 0.67481375, "learning_rate": 3.6858969971962573e-06, "loss": 0.69622803, "num_input_tokens_seen": 65062845, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.6796875, "step": 3009, "time_per_iteration": 2.3931095600128174 }, { "auxiliary_loss_clip": 0.01096304, "auxiliary_loss_mlp": 0.01034655, "balance_loss_clip": 1.01597512, "balance_loss_mlp": 1.02767563, "epoch": 0.1809709905305877, "flos": 24169322638080.0, "grad_norm": 2.562430365773123, "language_loss": 0.75639015, "learning_rate": 3.685693700869553e-06, "loss": 0.77769971, "num_input_tokens_seen": 65082110, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6875, "step": 3010, "time_per_iteration": 2.4174158573150635 }, { "auxiliary_loss_clip": 0.01089004, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.0177027, "balance_loss_mlp": 1.02533138, "epoch": 0.18103111378325568, "flos": 21649885395840.0, "grad_norm": 1.5937217071721066, "language_loss": 0.67342001, "learning_rate": 3.6854903443847772e-06, "loss": 0.69465506, "num_input_tokens_seen": 65101985, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.63671875, "step": 3011, "time_per_iteration": 2.4023966789245605 }, { "auxiliary_loss_clip": 0.0109164, "auxiliary_loss_mlp": 0.01035591, "balance_loss_clip": 1.01705384, "balance_loss_mlp": 1.02585912, "epoch": 0.18109123703592364, "flos": 53684965992960.0, "grad_norm": 1.7738741352484462, "language_loss": 0.71349472, "learning_rate": 3.6852869277491865e-06, "loss": 0.73476702, "num_input_tokens_seen": 65129295, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.66015625, "step": 3012, "time_per_iteration": 2.692625045776367 }, { "auxiliary_loss_clip": 0.01093415, "auxiliary_loss_mlp": 0.01038487, "balance_loss_clip": 1.02000999, "balance_loss_mlp": 1.02846265, "epoch": 0.1811513602885916, "flos": 35844104062080.0, "grad_norm": 2.037454486184128, "language_loss": 0.63142848, "learning_rate": 3.68508345097004e-06, "loss": 0.65274751, "num_input_tokens_seen": 65150625, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6484375, "step": 3013, "time_per_iteration": 2.505274772644043 }, { "auxiliary_loss_clip": 0.01097457, "auxiliary_loss_mlp": 0.01043572, "balance_loss_clip": 1.02592874, "balance_loss_mlp": 1.02976298, "epoch": 0.18121148354125957, "flos": 23731441966080.0, "grad_norm": 1.6507094366992145, "language_loss": 0.76124537, "learning_rate": 3.6848799140546e-06, "loss": 0.78265566, "num_input_tokens_seen": 65170880, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.6796875, "step": 3014, "time_per_iteration": 2.4147589206695557 }, { "auxiliary_loss_clip": 0.01098058, "auxiliary_loss_mlp": 0.01042203, "balance_loss_clip": 1.0211271, "balance_loss_mlp": 1.02832627, "epoch": 0.18127160679392756, "flos": 28727129232000.0, "grad_norm": 2.1835324373493568, "language_loss": 0.66143107, "learning_rate": 3.6846763170101297e-06, "loss": 0.68283367, "num_input_tokens_seen": 65192530, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.69921875, "step": 3015, "time_per_iteration": 2.45542573928833 }, { "auxiliary_loss_clip": 0.01092742, "auxiliary_loss_mlp": 0.01035633, "balance_loss_clip": 1.01674986, "balance_loss_mlp": 1.02746844, "epoch": 0.18133173004659553, "flos": 20484030631680.0, "grad_norm": 1.6975769583465765, "language_loss": 0.7801252, "learning_rate": 3.684472659843895e-06, "loss": 0.80140895, "num_input_tokens_seen": 65211675, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6484375, "step": 3016, "time_per_iteration": 2.3770716190338135 }, { "auxiliary_loss_clip": 0.01098174, "auxiliary_loss_mlp": 0.01038127, "balance_loss_clip": 1.0191493, "balance_loss_mlp": 1.02951336, "epoch": 0.1813918532992635, "flos": 22851107233920.0, "grad_norm": 1.6992091223276173, "language_loss": 0.83647573, "learning_rate": 3.6842689425631645e-06, "loss": 0.85783875, "num_input_tokens_seen": 65231185, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6875, "step": 3017, "time_per_iteration": 3.90226411819458 }, { "auxiliary_loss_clip": 0.01091733, "auxiliary_loss_mlp": 0.01035836, "balance_loss_clip": 1.01820517, "balance_loss_mlp": 1.0266006, "epoch": 0.18145197655193146, "flos": 36063637891200.0, "grad_norm": 4.773021443468642, "language_loss": 0.67354894, "learning_rate": 3.684065165175208e-06, "loss": 0.69482458, "num_input_tokens_seen": 65251645, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6484375, "step": 3018, "time_per_iteration": 2.5068161487579346 }, { "auxiliary_loss_clip": 0.01095457, "auxiliary_loss_mlp": 0.01034266, "balance_loss_clip": 1.01630139, "balance_loss_mlp": 1.0276463, "epoch": 0.18151209980459942, "flos": 24022827106560.0, "grad_norm": 1.9264538479523363, "language_loss": 0.75988364, "learning_rate": 3.683861327687297e-06, "loss": 0.78118086, "num_input_tokens_seen": 65271125, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.67578125, "step": 3019, "time_per_iteration": 3.7964046001434326 }, { "auxiliary_loss_clip": 0.01097, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.01623487, "balance_loss_mlp": 1.02833366, "epoch": 0.1815722230572674, "flos": 23950487036160.0, "grad_norm": 2.1930430614750605, "language_loss": 0.81399328, "learning_rate": 3.683657430106707e-06, "loss": 0.83531857, "num_input_tokens_seen": 65290600, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6875, "step": 3020, "time_per_iteration": 3.7735393047332764 }, { "auxiliary_loss_clip": 0.0109454, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.01861608, "balance_loss_mlp": 1.02751517, "epoch": 0.18163234630993536, "flos": 24385400242560.0, "grad_norm": 1.7764036231911737, "language_loss": 0.77370763, "learning_rate": 3.683453472440714e-06, "loss": 0.79502845, "num_input_tokens_seen": 65311040, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.671875, "step": 3021, "time_per_iteration": 2.4085967540740967 }, { "auxiliary_loss_clip": 0.01090358, "auxiliary_loss_mlp": 0.010367, "balance_loss_clip": 1.01873493, "balance_loss_mlp": 1.02425432, "epoch": 0.18169246956260335, "flos": 24680171784960.0, "grad_norm": 1.6819576893612795, "language_loss": 0.84895861, "learning_rate": 3.6832494546965975e-06, "loss": 0.87022913, "num_input_tokens_seen": 65332115, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.66015625, "step": 3022, "time_per_iteration": 3.787031888961792 }, { "auxiliary_loss_clip": 0.01093793, "auxiliary_loss_mlp": 0.01037292, "balance_loss_clip": 1.01830173, "balance_loss_mlp": 1.02538633, "epoch": 0.1817525928152713, "flos": 24242151467520.0, "grad_norm": 1.8081751746641312, "language_loss": 0.69382024, "learning_rate": 3.6830453768816376e-06, "loss": 0.71513104, "num_input_tokens_seen": 65352210, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.68359375, "step": 3023, "time_per_iteration": 2.3976800441741943 }, { "auxiliary_loss_clip": 0.0109341, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 1.01902175, "balance_loss_mlp": 1.02653098, "epoch": 0.18181271606793928, "flos": 16471148538240.0, "grad_norm": 1.8344109458778812, "language_loss": 0.73903406, "learning_rate": 3.6828412390031174e-06, "loss": 0.76033449, "num_input_tokens_seen": 65370600, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.66796875, "step": 3024, "time_per_iteration": 2.3516170978546143 }, { "auxiliary_loss_clip": 0.01094789, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.01550698, "balance_loss_mlp": 1.02728665, "epoch": 0.18187283932060724, "flos": 18580252037760.0, "grad_norm": 1.9590317436915126, "language_loss": 0.8824296, "learning_rate": 3.682637041068322e-06, "loss": 0.90372384, "num_input_tokens_seen": 65387270, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.67578125, "step": 3025, "time_per_iteration": 2.3552322387695312 }, { "auxiliary_loss_clip": 0.01093535, "auxiliary_loss_mlp": 0.01035689, "balance_loss_clip": 1.01683044, "balance_loss_mlp": 1.02792573, "epoch": 0.1819329625732752, "flos": 20265788522880.0, "grad_norm": 1.6937359134732468, "language_loss": 0.78706336, "learning_rate": 3.6824327830845387e-06, "loss": 0.80835557, "num_input_tokens_seen": 65406550, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.65625, "step": 3026, "time_per_iteration": 2.3805599212646484 }, { "auxiliary_loss_clip": 0.01095746, "auxiliary_loss_mlp": 0.01039595, "balance_loss_clip": 1.02073646, "balance_loss_mlp": 1.02767563, "epoch": 0.18199308582594317, "flos": 25914177256320.0, "grad_norm": 1.6651142140429498, "language_loss": 0.75892401, "learning_rate": 3.6822284650590576e-06, "loss": 0.78027743, "num_input_tokens_seen": 65425955, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6796875, "step": 3027, "time_per_iteration": 2.4328739643096924 }, { "auxiliary_loss_clip": 0.01097257, "auxiliary_loss_mlp": 0.01039279, "balance_loss_clip": 1.01903725, "balance_loss_mlp": 1.02633667, "epoch": 0.18205320907861114, "flos": 15376621415040.0, "grad_norm": 1.9379751917848766, "language_loss": 0.85705507, "learning_rate": 3.68202408699917e-06, "loss": 0.87842047, "num_input_tokens_seen": 65442820, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.7109375, "step": 3028, "time_per_iteration": 2.361747980117798 }, { "auxiliary_loss_clip": 0.01092009, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.01812363, "balance_loss_mlp": 1.02600992, "epoch": 0.18211333233127913, "flos": 25623280874880.0, "grad_norm": 1.9534451950594895, "language_loss": 0.82559109, "learning_rate": 3.6818196489121683e-06, "loss": 0.84688038, "num_input_tokens_seen": 65461825, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.66015625, "step": 3029, "time_per_iteration": 2.4177732467651367 }, { "auxiliary_loss_clip": 0.010948, "auxiliary_loss_mlp": 0.01039609, "balance_loss_clip": 1.02011859, "balance_loss_mlp": 1.02715087, "epoch": 0.1821734555839471, "flos": 14975120246400.0, "grad_norm": 1.9630182510346148, "language_loss": 0.77569473, "learning_rate": 3.68161515080535e-06, "loss": 0.79703879, "num_input_tokens_seen": 65479480, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6796875, "step": 3030, "time_per_iteration": 2.346942663192749 }, { "auxiliary_loss_clip": 0.01093451, "auxiliary_loss_mlp": 0.01034323, "balance_loss_clip": 1.01573825, "balance_loss_mlp": 1.02540433, "epoch": 0.18223357883661506, "flos": 20192959693440.0, "grad_norm": 1.9172145309317545, "language_loss": 0.84994686, "learning_rate": 3.681410592686013e-06, "loss": 0.87122458, "num_input_tokens_seen": 65497775, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6796875, "step": 3031, "time_per_iteration": 2.3773000240325928 }, { "auxiliary_loss_clip": 0.01094263, "auxiliary_loss_mlp": 0.01035067, "balance_loss_clip": 1.01641083, "balance_loss_mlp": 1.02630711, "epoch": 0.18229370208928303, "flos": 15231068490240.0, "grad_norm": 2.3139330996384486, "language_loss": 0.80105782, "learning_rate": 3.681205974561457e-06, "loss": 0.82235116, "num_input_tokens_seen": 65516505, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.6796875, "step": 3032, "time_per_iteration": 2.3608875274658203 }, { "auxiliary_loss_clip": 0.01099095, "auxiliary_loss_mlp": 0.01039615, "balance_loss_clip": 1.02026689, "balance_loss_mlp": 1.0278728, "epoch": 0.182353825341951, "flos": 23839393489920.0, "grad_norm": 2.2208230265790116, "language_loss": 0.81210154, "learning_rate": 3.6810012964389846e-06, "loss": 0.83348858, "num_input_tokens_seen": 65536160, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.7109375, "step": 3033, "time_per_iteration": 2.4141690731048584 }, { "auxiliary_loss_clip": 0.0102659, "auxiliary_loss_mlp": 0.01005783, "balance_loss_clip": 1.00319636, "balance_loss_mlp": 1.00633883, "epoch": 0.18241394859461896, "flos": 61188114038400.0, "grad_norm": 0.8954202485514626, "language_loss": 0.63418603, "learning_rate": 3.680796558325899e-06, "loss": 0.65450966, "num_input_tokens_seen": 65589375, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.20214844, "step": 3034, "time_per_iteration": 2.9214541912078857 }, { "auxiliary_loss_clip": 0.01093738, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.01731384, "balance_loss_mlp": 1.02661943, "epoch": 0.18247407184728695, "flos": 18470904059520.0, "grad_norm": 1.9366467761323554, "language_loss": 0.79605818, "learning_rate": 3.6805917602295084e-06, "loss": 0.81735319, "num_input_tokens_seen": 65606720, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.671875, "step": 3035, "time_per_iteration": 2.369335651397705 }, { "auxiliary_loss_clip": 0.01090274, "auxiliary_loss_mlp": 0.01031759, "balance_loss_clip": 1.01418781, "balance_loss_mlp": 1.02530837, "epoch": 0.18253419509995492, "flos": 21794216423040.0, "grad_norm": 1.7195695900240333, "language_loss": 0.84461898, "learning_rate": 3.680386902157121e-06, "loss": 0.8658393, "num_input_tokens_seen": 65625495, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6484375, "step": 3036, "time_per_iteration": 2.401374101638794 }, { "auxiliary_loss_clip": 0.01092218, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.01519525, "balance_loss_mlp": 1.02690077, "epoch": 0.18259431835262288, "flos": 20148934602240.0, "grad_norm": 2.0379480875904177, "language_loss": 0.79803252, "learning_rate": 3.680181984116047e-06, "loss": 0.81928569, "num_input_tokens_seen": 65643515, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.65234375, "step": 3037, "time_per_iteration": 2.3707478046417236 }, { "auxiliary_loss_clip": 0.01098544, "auxiliary_loss_mlp": 0.01038427, "balance_loss_clip": 1.01751804, "balance_loss_mlp": 1.02916551, "epoch": 0.18265444160529085, "flos": 16980740876160.0, "grad_norm": 4.894582151194986, "language_loss": 0.79521585, "learning_rate": 3.6799770061136e-06, "loss": 0.81658554, "num_input_tokens_seen": 65658155, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.6953125, "step": 3038, "time_per_iteration": 2.3390767574310303 }, { "auxiliary_loss_clip": 0.01093295, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.01652741, "balance_loss_mlp": 1.02583265, "epoch": 0.1827145648579588, "flos": 34421812295040.0, "grad_norm": 2.248150297807195, "language_loss": 0.67581129, "learning_rate": 3.6797719681570953e-06, "loss": 0.69708991, "num_input_tokens_seen": 65679310, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.671875, "step": 3039, "time_per_iteration": 2.5091495513916016 }, { "auxiliary_loss_clip": 0.01094548, "auxiliary_loss_mlp": 0.01036112, "balance_loss_clip": 1.01676488, "balance_loss_mlp": 1.02687049, "epoch": 0.18277468811062678, "flos": 53285035835520.0, "grad_norm": 2.444654234344379, "language_loss": 0.73460305, "learning_rate": 3.6795668702538505e-06, "loss": 0.75590956, "num_input_tokens_seen": 65705235, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6796875, "step": 3040, "time_per_iteration": 2.6638429164886475 }, { "auxiliary_loss_clip": 0.01093964, "auxiliary_loss_mlp": 0.01032173, "balance_loss_clip": 1.01376712, "balance_loss_mlp": 1.02748919, "epoch": 0.18283481136329474, "flos": 31649289540480.0, "grad_norm": 2.0313259677411803, "language_loss": 0.60360682, "learning_rate": 3.6793617124111836e-06, "loss": 0.62486821, "num_input_tokens_seen": 65727575, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.66796875, "step": 3041, "time_per_iteration": 2.4727203845977783 }, { "auxiliary_loss_clip": 0.01095961, "auxiliary_loss_mlp": 0.01041796, "balance_loss_clip": 1.02248418, "balance_loss_mlp": 1.02796614, "epoch": 0.18289493461596273, "flos": 53135782306560.0, "grad_norm": 1.6755375784789484, "language_loss": 0.60253775, "learning_rate": 3.6791564946364176e-06, "loss": 0.62391531, "num_input_tokens_seen": 65751370, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6796875, "step": 3042, "time_per_iteration": 2.6615445613861084 }, { "auxiliary_loss_clip": 0.01093501, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.01429451, "balance_loss_mlp": 1.02835739, "epoch": 0.1829550578686307, "flos": 25588297825920.0, "grad_norm": 1.6013928454406494, "language_loss": 0.87596387, "learning_rate": 3.678951216936875e-06, "loss": 0.89722371, "num_input_tokens_seen": 65771040, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.65234375, "step": 3043, "time_per_iteration": 2.425600051879883 }, { "auxiliary_loss_clip": 0.01096452, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.01756108, "balance_loss_mlp": 1.02765286, "epoch": 0.18301518112129866, "flos": 22600325871360.0, "grad_norm": 2.1711627912841824, "language_loss": 0.70740992, "learning_rate": 3.6787458793198825e-06, "loss": 0.72875857, "num_input_tokens_seen": 65789345, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.6875, "step": 3044, "time_per_iteration": 2.37648606300354 }, { "auxiliary_loss_clip": 0.01098498, "auxiliary_loss_mlp": 0.01040058, "balance_loss_clip": 1.01912498, "balance_loss_mlp": 1.02705944, "epoch": 0.18307530437396663, "flos": 34019403431040.0, "grad_norm": 2.1079227102188396, "language_loss": 0.64306909, "learning_rate": 3.678540481792768e-06, "loss": 0.66445458, "num_input_tokens_seen": 65810990, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.71484375, "step": 3045, "time_per_iteration": 2.4975714683532715 }, { "auxiliary_loss_clip": 0.01092421, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.01928782, "balance_loss_mlp": 1.02710378, "epoch": 0.1831354276266346, "flos": 21278933533440.0, "grad_norm": 2.2035152419714414, "language_loss": 0.79463446, "learning_rate": 3.6783350243628613e-06, "loss": 0.81593835, "num_input_tokens_seen": 65827230, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.65625, "step": 3046, "time_per_iteration": 2.3456878662109375 }, { "auxiliary_loss_clip": 0.01093784, "auxiliary_loss_mlp": 0.01036512, "balance_loss_clip": 1.01663971, "balance_loss_mlp": 1.02519798, "epoch": 0.18319555087930256, "flos": 21031887686400.0, "grad_norm": 3.27295768454744, "language_loss": 0.78758115, "learning_rate": 3.678129507037495e-06, "loss": 0.80888414, "num_input_tokens_seen": 65845900, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6875, "step": 3047, "time_per_iteration": 2.380859136581421 }, { "auxiliary_loss_clip": 0.01095002, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.01493669, "balance_loss_mlp": 1.02816248, "epoch": 0.18325567413197055, "flos": 34381627453440.0, "grad_norm": 1.5145269717417007, "language_loss": 0.80488312, "learning_rate": 3.6779239298240032e-06, "loss": 0.82617152, "num_input_tokens_seen": 65868730, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.66796875, "step": 3048, "time_per_iteration": 2.49118971824646 }, { "auxiliary_loss_clip": 0.01096412, "auxiliary_loss_mlp": 0.01041874, "balance_loss_clip": 1.02191842, "balance_loss_mlp": 1.02703547, "epoch": 0.18331579738463852, "flos": 20557418042880.0, "grad_norm": 2.404031913488755, "language_loss": 0.8656354, "learning_rate": 3.6777182927297225e-06, "loss": 0.88701832, "num_input_tokens_seen": 65888420, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.69140625, "step": 3049, "time_per_iteration": 2.37909197807312 }, { "auxiliary_loss_clip": 0.0110299, "auxiliary_loss_mlp": 0.01040136, "balance_loss_clip": 1.01970398, "balance_loss_mlp": 1.02914858, "epoch": 0.18337592063730648, "flos": 19606907744640.0, "grad_norm": 2.3591333444806923, "language_loss": 0.76766431, "learning_rate": 3.6775125957619913e-06, "loss": 0.78909552, "num_input_tokens_seen": 65905840, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.73828125, "step": 3050, "time_per_iteration": 2.3622822761535645 }, { "auxiliary_loss_clip": 0.01091689, "auxiliary_loss_mlp": 0.01032873, "balance_loss_clip": 1.01391912, "balance_loss_mlp": 1.02520013, "epoch": 0.18343604388997445, "flos": 20849815612800.0, "grad_norm": 2.0644922428537096, "language_loss": 0.99320161, "learning_rate": 3.6773068389281507e-06, "loss": 1.01444721, "num_input_tokens_seen": 65922845, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6640625, "step": 3051, "time_per_iteration": 2.3741469383239746 }, { "auxiliary_loss_clip": 0.01093216, "auxiliary_loss_mlp": 0.01036161, "balance_loss_clip": 1.01641965, "balance_loss_mlp": 1.02763176, "epoch": 0.1834961671426424, "flos": 24393080741760.0, "grad_norm": 2.272240720087646, "language_loss": 0.86265355, "learning_rate": 3.6771010222355434e-06, "loss": 0.88394737, "num_input_tokens_seen": 65945555, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.65625, "step": 3052, "time_per_iteration": 2.44637131690979 }, { "auxiliary_loss_clip": 0.01093433, "auxiliary_loss_mlp": 0.010358, "balance_loss_clip": 1.01694083, "balance_loss_mlp": 1.02554989, "epoch": 0.18355629039531038, "flos": 21250548731520.0, "grad_norm": 2.0104145983990582, "language_loss": 0.73043442, "learning_rate": 3.6768951456915147e-06, "loss": 0.75172675, "num_input_tokens_seen": 65963965, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6796875, "step": 3053, "time_per_iteration": 2.391200542449951 }, { "auxiliary_loss_clip": 0.01098646, "auxiliary_loss_mlp": 0.01038256, "balance_loss_clip": 1.01797879, "balance_loss_mlp": 1.02836823, "epoch": 0.18361641364797834, "flos": 28655277920640.0, "grad_norm": 1.8701847575316863, "language_loss": 0.61304927, "learning_rate": 3.6766892093034123e-06, "loss": 0.63441837, "num_input_tokens_seen": 65985965, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.703125, "step": 3054, "time_per_iteration": 2.433223247528076 }, { "auxiliary_loss_clip": 0.01095956, "auxiliary_loss_mlp": 0.01035546, "balance_loss_clip": 1.01764071, "balance_loss_mlp": 1.02743411, "epoch": 0.18367653690064634, "flos": 20917896497280.0, "grad_norm": 1.9018163121145335, "language_loss": 0.78297484, "learning_rate": 3.6764832130785846e-06, "loss": 0.80428982, "num_input_tokens_seen": 66005645, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6875, "step": 3055, "time_per_iteration": 2.4101922512054443 }, { "auxiliary_loss_clip": 0.01093526, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.0211978, "balance_loss_mlp": 1.02704954, "epoch": 0.1837366601533143, "flos": 28764381519360.0, "grad_norm": 14.423993688140268, "language_loss": 0.70290178, "learning_rate": 3.6762771570243834e-06, "loss": 0.72423035, "num_input_tokens_seen": 66025675, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6640625, "step": 3056, "time_per_iteration": 3.8228213787078857 }, { "auxiliary_loss_clip": 0.01095429, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.01788759, "balance_loss_mlp": 1.02727616, "epoch": 0.18379678340598227, "flos": 21250374174720.0, "grad_norm": 1.7355973902289035, "language_loss": 0.80511397, "learning_rate": 3.6760710411481623e-06, "loss": 0.82643723, "num_input_tokens_seen": 66046125, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6796875, "step": 3057, "time_per_iteration": 2.4140725135803223 }, { "auxiliary_loss_clip": 0.01100634, "auxiliary_loss_mlp": 0.01040652, "balance_loss_clip": 1.0181576, "balance_loss_mlp": 1.0272398, "epoch": 0.18385690665865023, "flos": 20448558823680.0, "grad_norm": 2.0946795049792666, "language_loss": 0.82550985, "learning_rate": 3.675864865457277e-06, "loss": 0.84692276, "num_input_tokens_seen": 66064375, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.734375, "step": 3058, "time_per_iteration": 2.379794120788574 }, { "auxiliary_loss_clip": 0.01096393, "auxiliary_loss_mlp": 0.01040085, "balance_loss_clip": 1.0208447, "balance_loss_mlp": 1.02711689, "epoch": 0.1839170299113182, "flos": 26139366725760.0, "grad_norm": 2.0980446596863476, "language_loss": 0.85711503, "learning_rate": 3.675658629959086e-06, "loss": 0.87847984, "num_input_tokens_seen": 66084590, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.69140625, "step": 3059, "time_per_iteration": 3.891240119934082 }, { "auxiliary_loss_clip": 0.01093734, "auxiliary_loss_mlp": 0.01036044, "balance_loss_clip": 1.01762605, "balance_loss_mlp": 1.02620101, "epoch": 0.18397715316398616, "flos": 31756717393920.0, "grad_norm": 1.6536118193911227, "language_loss": 0.72956884, "learning_rate": 3.6754523346609486e-06, "loss": 0.75086659, "num_input_tokens_seen": 66107105, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.671875, "step": 3060, "time_per_iteration": 2.4973304271698 }, { "auxiliary_loss_clip": 0.01097536, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 1.02152312, "balance_loss_mlp": 1.02810681, "epoch": 0.18403727641665413, "flos": 24610729357440.0, "grad_norm": 1.7066628802824622, "language_loss": 0.72872066, "learning_rate": 3.675245979570227e-06, "loss": 0.75011235, "num_input_tokens_seen": 66129295, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.6953125, "step": 3061, "time_per_iteration": 3.771973133087158 }, { "auxiliary_loss_clip": 0.01096596, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.02271307, "balance_loss_mlp": 1.02863383, "epoch": 0.18409739966932212, "flos": 23438800016640.0, "grad_norm": 1.8269389315323057, "language_loss": 0.81693745, "learning_rate": 3.6750395646942857e-06, "loss": 0.83833361, "num_input_tokens_seen": 66146910, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6796875, "step": 3062, "time_per_iteration": 2.428438425064087 }, { "auxiliary_loss_clip": 0.01100455, "auxiliary_loss_mlp": 0.01040849, "balance_loss_clip": 1.02059579, "balance_loss_mlp": 1.02837312, "epoch": 0.18415752292199009, "flos": 21871025147520.0, "grad_norm": 2.02936284940346, "language_loss": 0.73024154, "learning_rate": 3.674833090040491e-06, "loss": 0.75165462, "num_input_tokens_seen": 66165370, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.71875, "step": 3063, "time_per_iteration": 2.3711354732513428 }, { "auxiliary_loss_clip": 0.01092559, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.0182246, "balance_loss_mlp": 1.0251112, "epoch": 0.18421764617465805, "flos": 25409507420160.0, "grad_norm": 1.735898876993406, "language_loss": 0.65679663, "learning_rate": 3.6746265556162116e-06, "loss": 0.67808282, "num_input_tokens_seen": 66186210, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.67578125, "step": 3064, "time_per_iteration": 2.438000440597534 }, { "auxiliary_loss_clip": 0.01095055, "auxiliary_loss_mlp": 0.01036554, "balance_loss_clip": 1.01711094, "balance_loss_mlp": 1.02754521, "epoch": 0.18427776942732602, "flos": 27196920852480.0, "grad_norm": 2.484580111970709, "language_loss": 0.69019604, "learning_rate": 3.6744199614288174e-06, "loss": 0.71151215, "num_input_tokens_seen": 66204800, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.671875, "step": 3065, "time_per_iteration": 2.418018102645874 }, { "auxiliary_loss_clip": 0.01098874, "auxiliary_loss_mlp": 0.01041382, "balance_loss_clip": 1.01999593, "balance_loss_mlp": 1.02820754, "epoch": 0.18433789267999398, "flos": 27851193331200.0, "grad_norm": 2.250389640607876, "language_loss": 0.72799128, "learning_rate": 3.6742133074856828e-06, "loss": 0.74939388, "num_input_tokens_seen": 66222195, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.70703125, "step": 3066, "time_per_iteration": 2.4412014484405518 }, { "auxiliary_loss_clip": 0.01094326, "auxiliary_loss_mlp": 0.01037958, "balance_loss_clip": 1.01918304, "balance_loss_mlp": 1.02577353, "epoch": 0.18439801593266195, "flos": 17856013461120.0, "grad_norm": 2.4021419349828457, "language_loss": 0.81847805, "learning_rate": 3.6740065937941815e-06, "loss": 0.83980089, "num_input_tokens_seen": 66239505, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6875, "step": 3067, "time_per_iteration": 2.353297233581543 }, { "auxiliary_loss_clip": 0.01027585, "auxiliary_loss_mlp": 0.01006485, "balance_loss_clip": 1.00370753, "balance_loss_mlp": 1.00713754, "epoch": 0.18445813918532994, "flos": 56386403619840.0, "grad_norm": 0.9837619196764028, "language_loss": 0.5968374, "learning_rate": 3.673799820361691e-06, "loss": 0.61717808, "num_input_tokens_seen": 66295695, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.20507812, "step": 3068, "time_per_iteration": 2.925403594970703 }, { "auxiliary_loss_clip": 0.01094959, "auxiliary_loss_mlp": 0.01035708, "balance_loss_clip": 1.01705205, "balance_loss_mlp": 1.0279814, "epoch": 0.1845182624379979, "flos": 20956196125440.0, "grad_norm": 1.7097395201758374, "language_loss": 0.76456642, "learning_rate": 3.67359298719559e-06, "loss": 0.78587306, "num_input_tokens_seen": 66315315, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.671875, "step": 3069, "time_per_iteration": 2.3808977603912354 }, { "auxiliary_loss_clip": 0.01095065, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.01588392, "balance_loss_mlp": 1.02618957, "epoch": 0.18457838569066587, "flos": 20484135365760.0, "grad_norm": 1.818544207956705, "language_loss": 0.84722435, "learning_rate": 3.6733860943032607e-06, "loss": 0.86853123, "num_input_tokens_seen": 66333675, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6875, "step": 3070, "time_per_iteration": 2.394941568374634 }, { "auxiliary_loss_clip": 0.01095165, "auxiliary_loss_mlp": 0.01036996, "balance_loss_clip": 1.0171473, "balance_loss_mlp": 1.02638769, "epoch": 0.18463850894333383, "flos": 25008844124160.0, "grad_norm": 1.9230594787216562, "language_loss": 0.77398825, "learning_rate": 3.6731791416920863e-06, "loss": 0.79530984, "num_input_tokens_seen": 66354075, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.6875, "step": 3071, "time_per_iteration": 2.402108907699585 }, { "auxiliary_loss_clip": 0.01099213, "auxiliary_loss_mlp": 0.01046013, "balance_loss_clip": 1.02627218, "balance_loss_mlp": 1.02819109, "epoch": 0.1846986321960018, "flos": 16799681232000.0, "grad_norm": 2.5374344837361407, "language_loss": 0.77136636, "learning_rate": 3.6729721293694523e-06, "loss": 0.79281867, "num_input_tokens_seen": 66372520, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.7109375, "step": 3072, "time_per_iteration": 2.424731969833374 }, { "auxiliary_loss_clip": 0.01097982, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.01547766, "balance_loss_mlp": 1.02694249, "epoch": 0.18475875544866976, "flos": 20813261552640.0, "grad_norm": 1.872650812020611, "language_loss": 0.86287987, "learning_rate": 3.6727650573427464e-06, "loss": 0.88421261, "num_input_tokens_seen": 66390745, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.7109375, "step": 3073, "time_per_iteration": 2.382587432861328 }, { "auxiliary_loss_clip": 0.01097807, "auxiliary_loss_mlp": 0.01038702, "balance_loss_clip": 1.02026057, "balance_loss_mlp": 1.0290978, "epoch": 0.18481887870133773, "flos": 22600325871360.0, "grad_norm": 2.581206154861645, "language_loss": 0.91659003, "learning_rate": 3.672557925619358e-06, "loss": 0.93795508, "num_input_tokens_seen": 66410525, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6875, "step": 3074, "time_per_iteration": 2.416276216506958 }, { "auxiliary_loss_clip": 0.01095132, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 1.0198257, "balance_loss_mlp": 1.02679682, "epoch": 0.18487900195400572, "flos": 29457582030720.0, "grad_norm": 1.9061680747010519, "language_loss": 0.64877582, "learning_rate": 3.67235073420668e-06, "loss": 0.67013657, "num_input_tokens_seen": 66432535, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.68359375, "step": 3075, "time_per_iteration": 2.4442203044891357 }, { "auxiliary_loss_clip": 0.01096984, "auxiliary_loss_mlp": 0.01037603, "balance_loss_clip": 1.01770711, "balance_loss_mlp": 1.02914739, "epoch": 0.1849391252066737, "flos": 20627803077120.0, "grad_norm": 1.8140366071750742, "language_loss": 0.72486526, "learning_rate": 3.672143483112106e-06, "loss": 0.74621117, "num_input_tokens_seen": 66450620, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6796875, "step": 3076, "time_per_iteration": 2.3927721977233887 }, { "auxiliary_loss_clip": 0.0109696, "auxiliary_loss_mlp": 0.01038754, "balance_loss_clip": 1.01946545, "balance_loss_mlp": 1.02690089, "epoch": 0.18499924845934165, "flos": 14427682128000.0, "grad_norm": 2.272661651313579, "language_loss": 0.81143332, "learning_rate": 3.6719361723430325e-06, "loss": 0.83279043, "num_input_tokens_seen": 66467865, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.703125, "step": 3077, "time_per_iteration": 2.3459293842315674 }, { "auxiliary_loss_clip": 0.01093051, "auxiliary_loss_mlp": 0.01037479, "balance_loss_clip": 1.01968122, "balance_loss_mlp": 1.02585387, "epoch": 0.18505937171200962, "flos": 23726659109760.0, "grad_norm": 1.8577127837084841, "language_loss": 0.78537548, "learning_rate": 3.671728801906857e-06, "loss": 0.8066808, "num_input_tokens_seen": 66486245, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.671875, "step": 3078, "time_per_iteration": 2.426886558532715 }, { "auxiliary_loss_clip": 0.01095366, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.02030742, "balance_loss_mlp": 1.02709413, "epoch": 0.18511949496467758, "flos": 25956317134080.0, "grad_norm": 1.8974727382618128, "language_loss": 0.77608848, "learning_rate": 3.6715213718109816e-06, "loss": 0.79744172, "num_input_tokens_seen": 66506510, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.68359375, "step": 3079, "time_per_iteration": 2.4180920124053955 }, { "auxiliary_loss_clip": 0.01090707, "auxiliary_loss_mlp": 0.01038363, "balance_loss_clip": 1.02035069, "balance_loss_mlp": 1.02345252, "epoch": 0.18517961821734555, "flos": 42411895205760.0, "grad_norm": 1.751473176273842, "language_loss": 0.81666404, "learning_rate": 3.671313882062808e-06, "loss": 0.8379547, "num_input_tokens_seen": 66530960, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.671875, "step": 3080, "time_per_iteration": 2.6055829524993896 }, { "auxiliary_loss_clip": 0.01096193, "auxiliary_loss_mlp": 0.01039956, "balance_loss_clip": 1.01905835, "balance_loss_mlp": 1.02513731, "epoch": 0.18523974147001354, "flos": 24096423985920.0, "grad_norm": 1.8256720833864581, "language_loss": 0.73711753, "learning_rate": 3.6711063326697405e-06, "loss": 0.758479, "num_input_tokens_seen": 66550275, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.7109375, "step": 3081, "time_per_iteration": 2.4011452198028564 }, { "auxiliary_loss_clip": 0.01097376, "auxiliary_loss_mlp": 0.01039001, "balance_loss_clip": 1.01956987, "balance_loss_mlp": 1.0292356, "epoch": 0.1852998647226815, "flos": 27374210069760.0, "grad_norm": 2.017635067332807, "language_loss": 0.71629858, "learning_rate": 3.6708987236391867e-06, "loss": 0.73766237, "num_input_tokens_seen": 66569040, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6796875, "step": 3082, "time_per_iteration": 2.4442226886749268 }, { "auxiliary_loss_clip": 0.01095924, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.01576734, "balance_loss_mlp": 1.0284586, "epoch": 0.18535998797534947, "flos": 18331774824960.0, "grad_norm": 2.59461557446974, "language_loss": 0.69121969, "learning_rate": 3.6706910549785562e-06, "loss": 0.71254414, "num_input_tokens_seen": 66587775, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.67578125, "step": 3083, "time_per_iteration": 2.348402261734009 }, { "auxiliary_loss_clip": 0.01095571, "auxiliary_loss_mlp": 0.01035693, "balance_loss_clip": 1.01739383, "balance_loss_mlp": 1.02865887, "epoch": 0.18542011122801744, "flos": 37844522899200.0, "grad_norm": 2.0029760019679537, "language_loss": 0.68881965, "learning_rate": 3.670483326695259e-06, "loss": 0.71013224, "num_input_tokens_seen": 66610800, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.671875, "step": 3084, "time_per_iteration": 2.5439836978912354 }, { "auxiliary_loss_clip": 0.01093441, "auxiliary_loss_mlp": 0.01035632, "balance_loss_clip": 1.01701152, "balance_loss_mlp": 1.02686095, "epoch": 0.1854802344806854, "flos": 25185120912000.0, "grad_norm": 1.7511023900322003, "language_loss": 0.77998507, "learning_rate": 3.6702755387967097e-06, "loss": 0.80127585, "num_input_tokens_seen": 66630960, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6640625, "step": 3085, "time_per_iteration": 2.4212591648101807 }, { "auxiliary_loss_clip": 0.01093055, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 1.01842999, "balance_loss_mlp": 1.02593207, "epoch": 0.18554035773335337, "flos": 26683662821760.0, "grad_norm": 2.121942337652293, "language_loss": 0.73581004, "learning_rate": 3.6700676912903214e-06, "loss": 0.75711393, "num_input_tokens_seen": 66650585, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.671875, "step": 3086, "time_per_iteration": 2.4404194355010986 }, { "auxiliary_loss_clip": 0.010925, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.01865935, "balance_loss_mlp": 1.02658355, "epoch": 0.18560048098602133, "flos": 22345774081920.0, "grad_norm": 2.215072680838077, "language_loss": 0.69519728, "learning_rate": 3.6698597841835144e-06, "loss": 0.71651012, "num_input_tokens_seen": 66670045, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.65625, "step": 3087, "time_per_iteration": 2.3740296363830566 }, { "auxiliary_loss_clip": 0.01092804, "auxiliary_loss_mlp": 0.0103977, "balance_loss_clip": 1.01925397, "balance_loss_mlp": 1.02611017, "epoch": 0.18566060423868933, "flos": 17747573178240.0, "grad_norm": 2.3717575823996118, "language_loss": 0.73237813, "learning_rate": 3.6696518174837064e-06, "loss": 0.75370395, "num_input_tokens_seen": 66688790, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.66796875, "step": 3088, "time_per_iteration": 2.3804128170013428 }, { "auxiliary_loss_clip": 0.01092503, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.01862299, "balance_loss_mlp": 1.0264163, "epoch": 0.1857207274913573, "flos": 24676226801280.0, "grad_norm": 1.8183974767075333, "language_loss": 0.91748768, "learning_rate": 3.6694437911983197e-06, "loss": 0.93877304, "num_input_tokens_seen": 66708090, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.66015625, "step": 3089, "time_per_iteration": 2.4014928340911865 }, { "auxiliary_loss_clip": 0.01090311, "auxiliary_loss_mlp": 0.01034988, "balance_loss_clip": 1.01645148, "balance_loss_mlp": 1.02585053, "epoch": 0.18578085074402526, "flos": 28146558366720.0, "grad_norm": 4.054582797978431, "language_loss": 0.57891083, "learning_rate": 3.669235705334779e-06, "loss": 0.60016382, "num_input_tokens_seen": 66727320, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.64453125, "step": 3090, "time_per_iteration": 2.441551446914673 }, { "auxiliary_loss_clip": 0.01089237, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.01866841, "balance_loss_mlp": 1.02500856, "epoch": 0.18584097399669322, "flos": 23950731415680.0, "grad_norm": 1.991318415116826, "language_loss": 0.81947285, "learning_rate": 3.669027559900509e-06, "loss": 0.84073359, "num_input_tokens_seen": 66747505, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.640625, "step": 3091, "time_per_iteration": 2.428163528442383 }, { "auxiliary_loss_clip": 0.01094214, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.02330172, "balance_loss_mlp": 1.02634609, "epoch": 0.18590109724936119, "flos": 17200728552960.0, "grad_norm": 5.040958102622603, "language_loss": 0.84161019, "learning_rate": 3.6688193549029397e-06, "loss": 0.86297488, "num_input_tokens_seen": 66766425, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6796875, "step": 3092, "time_per_iteration": 2.395855188369751 }, { "auxiliary_loss_clip": 0.01096124, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.02018285, "balance_loss_mlp": 1.0266664, "epoch": 0.18596122050202915, "flos": 17233791477120.0, "grad_norm": 2.3730589433562215, "language_loss": 0.93141162, "learning_rate": 3.6686110903494995e-06, "loss": 0.95277476, "num_input_tokens_seen": 66781130, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.6953125, "step": 3093, "time_per_iteration": 2.3353850841522217 }, { "auxiliary_loss_clip": 0.01094786, "auxiliary_loss_mlp": 0.01040432, "balance_loss_clip": 1.02232397, "balance_loss_mlp": 1.02821648, "epoch": 0.18602134375469712, "flos": 19019878277760.0, "grad_norm": 1.8480169426734527, "language_loss": 0.77004647, "learning_rate": 3.668402766247622e-06, "loss": 0.7913987, "num_input_tokens_seen": 66797535, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6640625, "step": 3094, "time_per_iteration": 2.387707233428955 }, { "auxiliary_loss_clip": 0.01094195, "auxiliary_loss_mlp": 0.01037264, "balance_loss_clip": 1.01876283, "balance_loss_mlp": 1.02692091, "epoch": 0.1860814670073651, "flos": 50948229248640.0, "grad_norm": 1.6034813027980024, "language_loss": 0.69743431, "learning_rate": 3.6681943826047413e-06, "loss": 0.71874893, "num_input_tokens_seen": 66821720, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.671875, "step": 3095, "time_per_iteration": 2.652736186981201 }, { "auxiliary_loss_clip": 0.01094172, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.01911986, "balance_loss_mlp": 1.02603316, "epoch": 0.18614159026003307, "flos": 19389957356160.0, "grad_norm": 2.022373330053034, "language_loss": 0.80696297, "learning_rate": 3.6679859394282944e-06, "loss": 0.82829237, "num_input_tokens_seen": 66839060, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6796875, "step": 3096, "time_per_iteration": 3.8538777828216553 }, { "auxiliary_loss_clip": 0.01092448, "auxiliary_loss_mlp": 0.010374, "balance_loss_clip": 1.01905358, "balance_loss_mlp": 1.02626419, "epoch": 0.18620171351270104, "flos": 21797707559040.0, "grad_norm": 2.027790195257226, "language_loss": 0.74775016, "learning_rate": 3.6677774367257194e-06, "loss": 0.76904869, "num_input_tokens_seen": 66857760, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6640625, "step": 3097, "time_per_iteration": 2.3681282997131348 }, { "auxiliary_loss_clip": 0.01092863, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.0169971, "balance_loss_mlp": 1.02733111, "epoch": 0.186261836765369, "flos": 16361940205440.0, "grad_norm": 2.02847078941279, "language_loss": 0.65580666, "learning_rate": 3.6675688745044583e-06, "loss": 0.67709458, "num_input_tokens_seen": 66876460, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.65625, "step": 3098, "time_per_iteration": 2.395278215408325 }, { "auxiliary_loss_clip": 0.01092862, "auxiliary_loss_mlp": 0.01038471, "balance_loss_clip": 1.01805055, "balance_loss_mlp": 1.02456856, "epoch": 0.18632196001803697, "flos": 23368868830080.0, "grad_norm": 1.7684825156704067, "language_loss": 0.6959098, "learning_rate": 3.6673602527719533e-06, "loss": 0.71722305, "num_input_tokens_seen": 66897960, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.68359375, "step": 3099, "time_per_iteration": 5.235820531845093 }, { "auxiliary_loss_clip": 0.01095068, "auxiliary_loss_mlp": 0.0104164, "balance_loss_clip": 1.02186322, "balance_loss_mlp": 1.02683389, "epoch": 0.18638208327070493, "flos": 22490908070400.0, "grad_norm": 1.54436392584293, "language_loss": 0.71356487, "learning_rate": 3.66715157153565e-06, "loss": 0.73493195, "num_input_tokens_seen": 66917675, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.6796875, "step": 3100, "time_per_iteration": 2.3982720375061035 }, { "auxiliary_loss_clip": 0.01094381, "auxiliary_loss_mlp": 0.01045709, "balance_loss_clip": 1.02631426, "balance_loss_mlp": 1.02628279, "epoch": 0.18644220652337293, "flos": 29164067297280.0, "grad_norm": 1.9548442918114333, "language_loss": 0.80125928, "learning_rate": 3.666942830802996e-06, "loss": 0.82266021, "num_input_tokens_seen": 66936000, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6796875, "step": 3101, "time_per_iteration": 3.767545223236084 }, { "auxiliary_loss_clip": 0.01090047, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.01759875, "balance_loss_mlp": 1.02579045, "epoch": 0.1865023297760409, "flos": 24242640226560.0, "grad_norm": 1.8817863431958264, "language_loss": 0.76819777, "learning_rate": 3.6667340305814394e-06, "loss": 0.78945494, "num_input_tokens_seen": 66955700, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.640625, "step": 3102, "time_per_iteration": 2.4271433353424072 }, { "auxiliary_loss_clip": 0.01095077, "auxiliary_loss_mlp": 0.01033553, "balance_loss_clip": 1.01529002, "balance_loss_mlp": 1.02549887, "epoch": 0.18656245302870886, "flos": 19127899624320.0, "grad_norm": 2.3303646098086364, "language_loss": 0.76854289, "learning_rate": 3.6665251708784325e-06, "loss": 0.78982925, "num_input_tokens_seen": 66972815, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.6953125, "step": 3103, "time_per_iteration": 2.3667640686035156 }, { "auxiliary_loss_clip": 0.01094081, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.0222609, "balance_loss_mlp": 1.0266794, "epoch": 0.18662257628137682, "flos": 17785104756480.0, "grad_norm": 1.6592553548614029, "language_loss": 0.79195917, "learning_rate": 3.6663162517014294e-06, "loss": 0.81330061, "num_input_tokens_seen": 66992280, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.67578125, "step": 3104, "time_per_iteration": 2.387533187866211 }, { "auxiliary_loss_clip": 0.01093203, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 1.01515698, "balance_loss_mlp": 1.0283078, "epoch": 0.1866826995340448, "flos": 24023246042880.0, "grad_norm": 2.181170976821608, "language_loss": 0.85263824, "learning_rate": 3.6661072730578858e-06, "loss": 0.87389648, "num_input_tokens_seen": 67012220, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.6484375, "step": 3105, "time_per_iteration": 2.407792568206787 }, { "auxiliary_loss_clip": 0.01095813, "auxiliary_loss_mlp": 0.01037537, "balance_loss_clip": 1.01649654, "balance_loss_mlp": 1.02444923, "epoch": 0.18674282278671275, "flos": 26140030041600.0, "grad_norm": 2.084470877677171, "language_loss": 0.86739075, "learning_rate": 3.665898234955259e-06, "loss": 0.88872427, "num_input_tokens_seen": 67032030, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.71484375, "step": 3106, "time_per_iteration": 2.4420242309570312 }, { "auxiliary_loss_clip": 0.01094503, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.01956582, "balance_loss_mlp": 1.02600205, "epoch": 0.18680294603938072, "flos": 19201112478720.0, "grad_norm": 1.9136123432418746, "language_loss": 0.78331274, "learning_rate": 3.6656891374010097e-06, "loss": 0.80464196, "num_input_tokens_seen": 67048920, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6875, "step": 3107, "time_per_iteration": 2.356750726699829 }, { "auxiliary_loss_clip": 0.01094176, "auxiliary_loss_mlp": 0.0104153, "balance_loss_clip": 1.02033496, "balance_loss_mlp": 1.02441061, "epoch": 0.1868630692920487, "flos": 28543730526720.0, "grad_norm": 2.008652235562938, "language_loss": 0.73930967, "learning_rate": 3.665479980402599e-06, "loss": 0.76066679, "num_input_tokens_seen": 67068645, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.6953125, "step": 3108, "time_per_iteration": 2.44694185256958 }, { "auxiliary_loss_clip": 0.01094503, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 1.01715183, "balance_loss_mlp": 1.02770066, "epoch": 0.18692319254471668, "flos": 17237073144960.0, "grad_norm": 1.7324864186310476, "language_loss": 0.74367827, "learning_rate": 3.665270763967493e-06, "loss": 0.76498055, "num_input_tokens_seen": 67087075, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.66796875, "step": 3109, "time_per_iteration": 2.361837387084961 }, { "auxiliary_loss_clip": 0.01091546, "auxiliary_loss_mlp": 0.01034762, "balance_loss_clip": 1.01597452, "balance_loss_mlp": 1.02526867, "epoch": 0.18698331579738464, "flos": 23184073670400.0, "grad_norm": 1.6227875736907937, "language_loss": 0.84263664, "learning_rate": 3.6650614881031567e-06, "loss": 0.86389971, "num_input_tokens_seen": 67108040, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6640625, "step": 3110, "time_per_iteration": 2.40545916557312 }, { "auxiliary_loss_clip": 0.01095176, "auxiliary_loss_mlp": 0.01040211, "balance_loss_clip": 1.02092266, "balance_loss_mlp": 1.02745783, "epoch": 0.1870434390500526, "flos": 25515643553280.0, "grad_norm": 2.1454489632159643, "language_loss": 0.84406185, "learning_rate": 3.664852152817059e-06, "loss": 0.86541569, "num_input_tokens_seen": 67127605, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.67578125, "step": 3111, "time_per_iteration": 2.4096744060516357 }, { "auxiliary_loss_clip": 0.01091842, "auxiliary_loss_mlp": 0.01036712, "balance_loss_clip": 1.01865137, "balance_loss_mlp": 1.0261302, "epoch": 0.18710356230272057, "flos": 19499794093440.0, "grad_norm": 1.9619282300149858, "language_loss": 0.76870215, "learning_rate": 3.6646427581166702e-06, "loss": 0.78998768, "num_input_tokens_seen": 67145785, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.65625, "step": 3112, "time_per_iteration": 2.378523111343384 }, { "auxiliary_loss_clip": 0.0109347, "auxiliary_loss_mlp": 0.01037504, "balance_loss_clip": 1.01946771, "balance_loss_mlp": 1.02579355, "epoch": 0.18716368555538854, "flos": 26759633673600.0, "grad_norm": 2.0331328031005156, "language_loss": 0.64472282, "learning_rate": 3.6644333040094636e-06, "loss": 0.66603267, "num_input_tokens_seen": 67165930, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.67578125, "step": 3113, "time_per_iteration": 2.4664793014526367 }, { "auxiliary_loss_clip": 0.0109792, "auxiliary_loss_mlp": 0.01032934, "balance_loss_clip": 1.01378942, "balance_loss_mlp": 1.02778101, "epoch": 0.1872238088080565, "flos": 25188716782080.0, "grad_norm": 3.8465490341367548, "language_loss": 0.81099665, "learning_rate": 3.6642237905029132e-06, "loss": 0.83230519, "num_input_tokens_seen": 67185830, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.703125, "step": 3114, "time_per_iteration": 2.439657688140869 }, { "auxiliary_loss_clip": 0.01094659, "auxiliary_loss_mlp": 0.01041677, "balance_loss_clip": 1.02135181, "balance_loss_mlp": 1.02711296, "epoch": 0.1872839320607245, "flos": 24133152602880.0, "grad_norm": 1.890025172784249, "language_loss": 0.57458973, "learning_rate": 3.664014217604497e-06, "loss": 0.59595311, "num_input_tokens_seen": 67206930, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.67578125, "step": 3115, "time_per_iteration": 2.4208052158355713 }, { "auxiliary_loss_clip": 0.01091956, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.01615715, "balance_loss_mlp": 1.02803063, "epoch": 0.18734405531339246, "flos": 21172867223040.0, "grad_norm": 2.0238768270140497, "language_loss": 0.71198618, "learning_rate": 3.6638045853216938e-06, "loss": 0.733248, "num_input_tokens_seen": 67226290, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.640625, "step": 3116, "time_per_iteration": 2.40444016456604 }, { "auxiliary_loss_clip": 0.01089161, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.01394904, "balance_loss_mlp": 1.02528214, "epoch": 0.18740417856606043, "flos": 17236758942720.0, "grad_norm": 1.9591007837706198, "language_loss": 0.78899264, "learning_rate": 3.663594893661985e-06, "loss": 0.81019044, "num_input_tokens_seen": 67244410, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.640625, "step": 3117, "time_per_iteration": 2.3701653480529785 }, { "auxiliary_loss_clip": 0.01092906, "auxiliary_loss_mlp": 0.01033878, "balance_loss_clip": 1.01637793, "balance_loss_mlp": 1.02760911, "epoch": 0.1874643018187284, "flos": 32556787176960.0, "grad_norm": 1.7784577775703345, "language_loss": 0.84191912, "learning_rate": 3.663385142632853e-06, "loss": 0.86318696, "num_input_tokens_seen": 67264470, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.65234375, "step": 3118, "time_per_iteration": 2.495004177093506 }, { "auxiliary_loss_clip": 0.01092904, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.01405048, "balance_loss_mlp": 1.0258255, "epoch": 0.18752442507139636, "flos": 23257042145280.0, "grad_norm": 10.896413965969675, "language_loss": 0.76097798, "learning_rate": 3.663175332241785e-06, "loss": 0.78222704, "num_input_tokens_seen": 67284315, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.66796875, "step": 3119, "time_per_iteration": 2.3944449424743652 }, { "auxiliary_loss_clip": 0.01094838, "auxiliary_loss_mlp": 0.01039663, "balance_loss_clip": 1.02112603, "balance_loss_mlp": 1.02713871, "epoch": 0.18758454832406432, "flos": 21759896689920.0, "grad_norm": 1.9147952443760252, "language_loss": 0.82168788, "learning_rate": 3.6629654624962666e-06, "loss": 0.84303284, "num_input_tokens_seen": 67302780, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6796875, "step": 3120, "time_per_iteration": 2.40030574798584 }, { "auxiliary_loss_clip": 0.01089715, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.01741982, "balance_loss_mlp": 1.02592897, "epoch": 0.1876446715767323, "flos": 29568919956480.0, "grad_norm": 2.0956589745081087, "language_loss": 0.85304511, "learning_rate": 3.6627555334037893e-06, "loss": 0.87428761, "num_input_tokens_seen": 67323405, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.63671875, "step": 3121, "time_per_iteration": 2.442066192626953 }, { "auxiliary_loss_clip": 0.01092619, "auxiliary_loss_mlp": 0.01037071, "balance_loss_clip": 1.0192734, "balance_loss_mlp": 1.0265708, "epoch": 0.18770479482940028, "flos": 30338580078720.0, "grad_norm": 1.771444275332751, "language_loss": 0.70667934, "learning_rate": 3.662545544971844e-06, "loss": 0.72797626, "num_input_tokens_seen": 67345800, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.66015625, "step": 3122, "time_per_iteration": 2.4791295528411865 }, { "auxiliary_loss_clip": 0.01088481, "auxiliary_loss_mlp": 0.01035789, "balance_loss_clip": 1.01690674, "balance_loss_mlp": 1.02441263, "epoch": 0.18776491808206824, "flos": 14464480567680.0, "grad_norm": 2.354608206212646, "language_loss": 0.70926332, "learning_rate": 3.662335497207924e-06, "loss": 0.73050606, "num_input_tokens_seen": 67363575, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.640625, "step": 3123, "time_per_iteration": 2.3511626720428467 }, { "auxiliary_loss_clip": 0.01090677, "auxiliary_loss_mlp": 0.01035649, "balance_loss_clip": 1.01949656, "balance_loss_mlp": 1.02599788, "epoch": 0.1878250413347362, "flos": 24497401484160.0, "grad_norm": 1.9326721085903336, "language_loss": 0.74157596, "learning_rate": 3.662125390119527e-06, "loss": 0.76283926, "num_input_tokens_seen": 67381765, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.6484375, "step": 3124, "time_per_iteration": 2.408418655395508 }, { "auxiliary_loss_clip": 0.01092443, "auxiliary_loss_mlp": 0.01036397, "balance_loss_clip": 1.01840878, "balance_loss_mlp": 1.02593827, "epoch": 0.18788516458740417, "flos": 39784611173760.0, "grad_norm": 1.6700523984093973, "language_loss": 0.8071084, "learning_rate": 3.66191522371415e-06, "loss": 0.8283968, "num_input_tokens_seen": 67405000, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6640625, "step": 3125, "time_per_iteration": 2.548044443130493 }, { "auxiliary_loss_clip": 0.01028515, "auxiliary_loss_mlp": 0.01011871, "balance_loss_clip": 1.00909352, "balance_loss_mlp": 1.00887442, "epoch": 0.18794528784007214, "flos": 64696151738880.0, "grad_norm": 0.9630832536964263, "language_loss": 0.63649619, "learning_rate": 3.6617049979992937e-06, "loss": 0.65690005, "num_input_tokens_seen": 67467140, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.19628906, "step": 3126, "time_per_iteration": 3.0950701236724854 }, { "auxiliary_loss_clip": 0.01090555, "auxiliary_loss_mlp": 0.01034912, "balance_loss_clip": 1.01673269, "balance_loss_mlp": 1.02696335, "epoch": 0.1880054110927401, "flos": 28620783630720.0, "grad_norm": 1.6531561448667726, "language_loss": 0.81115246, "learning_rate": 3.6614947129824603e-06, "loss": 0.83240718, "num_input_tokens_seen": 67487980, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.63671875, "step": 3127, "time_per_iteration": 2.4451937675476074 }, { "auxiliary_loss_clip": 0.01026604, "auxiliary_loss_mlp": 0.01004844, "balance_loss_clip": 1.00189924, "balance_loss_mlp": 1.00730681, "epoch": 0.1880655343454081, "flos": 64485625040640.0, "grad_norm": 0.7640370653681977, "language_loss": 0.61857343, "learning_rate": 3.6612843686711542e-06, "loss": 0.63888794, "num_input_tokens_seen": 67552500, "router_z_loss_clip": 0.02941895, "router_z_loss_mlp": 0.19335938, "step": 3128, "time_per_iteration": 3.1438512802124023 }, { "auxiliary_loss_clip": 0.01095201, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 1.0115037, "balance_loss_mlp": 1.02658224, "epoch": 0.18812565759807606, "flos": 32123095868160.0, "grad_norm": 2.1786778699681593, "language_loss": 0.70593059, "learning_rate": 3.661073965072883e-06, "loss": 0.72717929, "num_input_tokens_seen": 67573295, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6875, "step": 3129, "time_per_iteration": 2.520672559738159 }, { "auxiliary_loss_clip": 0.01094378, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.02124667, "balance_loss_mlp": 1.02671683, "epoch": 0.18818578085074403, "flos": 20623683536640.0, "grad_norm": 2.4633057488059817, "language_loss": 0.85206509, "learning_rate": 3.6608635021951546e-06, "loss": 0.87341321, "num_input_tokens_seen": 67590010, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.67578125, "step": 3130, "time_per_iteration": 2.4170432090759277 }, { "auxiliary_loss_clip": 0.01093009, "auxiliary_loss_mlp": 0.01035344, "balance_loss_clip": 1.01569831, "balance_loss_mlp": 1.025419, "epoch": 0.188245904103412, "flos": 28839235207680.0, "grad_norm": 2.0859429617400593, "language_loss": 0.77010924, "learning_rate": 3.6606529800454794e-06, "loss": 0.7913928, "num_input_tokens_seen": 67611110, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.67578125, "step": 3131, "time_per_iteration": 2.4445645809173584 }, { "auxiliary_loss_clip": 0.01091879, "auxiliary_loss_mlp": 0.01037633, "balance_loss_clip": 1.0196805, "balance_loss_mlp": 1.02741194, "epoch": 0.18830602735607996, "flos": 29419142757120.0, "grad_norm": 2.053265037194725, "language_loss": 0.81552517, "learning_rate": 3.660442398631372e-06, "loss": 0.83682024, "num_input_tokens_seen": 67631990, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.64453125, "step": 3132, "time_per_iteration": 2.4377148151397705 }, { "auxiliary_loss_clip": 0.01094441, "auxiliary_loss_mlp": 0.01040242, "balance_loss_clip": 1.02169347, "balance_loss_mlp": 1.02709687, "epoch": 0.18836615060874792, "flos": 28871774461440.0, "grad_norm": 2.180847825789763, "language_loss": 0.79780543, "learning_rate": 3.660231757960346e-06, "loss": 0.81915224, "num_input_tokens_seen": 67650490, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.671875, "step": 3133, "time_per_iteration": 2.4449267387390137 }, { "auxiliary_loss_clip": 0.01093615, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.02441084, "balance_loss_mlp": 1.02740383, "epoch": 0.18842627386141592, "flos": 22600570250880.0, "grad_norm": 11.22870173067583, "language_loss": 0.82609212, "learning_rate": 3.660021058039919e-06, "loss": 0.84746218, "num_input_tokens_seen": 67668860, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.66015625, "step": 3134, "time_per_iteration": 2.400921583175659 }, { "auxiliary_loss_clip": 0.01092727, "auxiliary_loss_mlp": 0.01038447, "balance_loss_clip": 1.01982653, "balance_loss_mlp": 1.02743077, "epoch": 0.18848639711408388, "flos": 24572394817920.0, "grad_norm": 1.5148271158508548, "language_loss": 0.8306362, "learning_rate": 3.659810298877611e-06, "loss": 0.8519479, "num_input_tokens_seen": 67690220, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.65234375, "step": 3135, "time_per_iteration": 3.787660598754883 }, { "auxiliary_loss_clip": 0.01098167, "auxiliary_loss_mlp": 0.010379, "balance_loss_clip": 1.01861191, "balance_loss_mlp": 1.02853096, "epoch": 0.18854652036675185, "flos": 34165514937600.0, "grad_norm": 2.004547343841207, "language_loss": 0.78512704, "learning_rate": 3.659599480480943e-06, "loss": 0.80648768, "num_input_tokens_seen": 67709820, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6953125, "step": 3136, "time_per_iteration": 2.479468822479248 }, { "auxiliary_loss_clip": 0.01093813, "auxiliary_loss_mlp": 0.0104207, "balance_loss_clip": 1.02251983, "balance_loss_mlp": 1.02774096, "epoch": 0.1886066436194198, "flos": 24199278451200.0, "grad_norm": 2.0835368106190146, "language_loss": 0.81216836, "learning_rate": 3.659388602857438e-06, "loss": 0.83352721, "num_input_tokens_seen": 67729490, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.66015625, "step": 3137, "time_per_iteration": 2.4111809730529785 }, { "auxiliary_loss_clip": 0.01095639, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.01908875, "balance_loss_mlp": 1.02895725, "epoch": 0.18866676687208778, "flos": 21250059972480.0, "grad_norm": 1.5100388369519946, "language_loss": 0.80736995, "learning_rate": 3.6591776660146225e-06, "loss": 0.82869452, "num_input_tokens_seen": 67749665, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.66796875, "step": 3138, "time_per_iteration": 3.7891883850097656 }, { "auxiliary_loss_clip": 0.01095721, "auxiliary_loss_mlp": 0.01039131, "balance_loss_clip": 1.02066553, "balance_loss_mlp": 1.02673495, "epoch": 0.18872689012475574, "flos": 37307069429760.0, "grad_norm": 2.0378947896863555, "language_loss": 0.63375771, "learning_rate": 3.6589666699600247e-06, "loss": 0.65510619, "num_input_tokens_seen": 67776230, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.69140625, "step": 3139, "time_per_iteration": 3.908792495727539 }, { "auxiliary_loss_clip": 0.01092315, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.017326, "balance_loss_mlp": 1.02600133, "epoch": 0.1887870133774237, "flos": 21651246938880.0, "grad_norm": 2.4065564903787893, "language_loss": 0.71284431, "learning_rate": 3.6587556147011728e-06, "loss": 0.73414028, "num_input_tokens_seen": 67795080, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6640625, "step": 3140, "time_per_iteration": 3.7559385299682617 }, { "auxiliary_loss_clip": 0.01094735, "auxiliary_loss_mlp": 0.01037656, "balance_loss_clip": 1.01778328, "balance_loss_mlp": 1.02658761, "epoch": 0.1888471366300917, "flos": 15923745331200.0, "grad_norm": 2.4677283334453546, "language_loss": 0.87063736, "learning_rate": 3.6585445002456004e-06, "loss": 0.89196122, "num_input_tokens_seen": 67813110, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6796875, "step": 3141, "time_per_iteration": 2.3590939044952393 }, { "auxiliary_loss_clip": 0.01096879, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.01556396, "balance_loss_mlp": 1.0269196, "epoch": 0.18890725988275966, "flos": 18550959540480.0, "grad_norm": 1.8405154147025118, "language_loss": 0.7696079, "learning_rate": 3.6583333266008404e-06, "loss": 0.79094052, "num_input_tokens_seen": 67831070, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.69921875, "step": 3142, "time_per_iteration": 2.342622756958008 }, { "auxiliary_loss_clip": 0.01091897, "auxiliary_loss_mlp": 0.01035272, "balance_loss_clip": 1.01705718, "balance_loss_mlp": 1.02641535, "epoch": 0.18896738313542763, "flos": 28839584321280.0, "grad_norm": 1.7804266465807372, "language_loss": 0.78882277, "learning_rate": 3.6581220937744305e-06, "loss": 0.81009448, "num_input_tokens_seen": 67852170, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.65234375, "step": 3143, "time_per_iteration": 2.4450275897979736 }, { "auxiliary_loss_clip": 0.01094285, "auxiliary_loss_mlp": 0.01040665, "balance_loss_clip": 1.02175856, "balance_loss_mlp": 1.02750754, "epoch": 0.1890275063880956, "flos": 22411830107520.0, "grad_norm": 2.287618186149079, "language_loss": 0.71571839, "learning_rate": 3.6579108017739076e-06, "loss": 0.73706782, "num_input_tokens_seen": 67869945, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.66796875, "step": 3144, "time_per_iteration": 2.3720335960388184 }, { "auxiliary_loss_clip": 0.01094998, "auxiliary_loss_mlp": 0.01037076, "balance_loss_clip": 1.01803827, "balance_loss_mlp": 1.02687371, "epoch": 0.18908762964076356, "flos": 24242744960640.0, "grad_norm": 2.6075958756325393, "language_loss": 0.73052001, "learning_rate": 3.6576994506068136e-06, "loss": 0.75184077, "num_input_tokens_seen": 67890240, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6796875, "step": 3145, "time_per_iteration": 2.4356400966644287 }, { "auxiliary_loss_clip": 0.0109119, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.01741934, "balance_loss_mlp": 1.02506208, "epoch": 0.18914775289343153, "flos": 16981962773760.0, "grad_norm": 2.683121301152449, "language_loss": 0.76936823, "learning_rate": 3.6574880402806897e-06, "loss": 0.79062837, "num_input_tokens_seen": 67907825, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.6640625, "step": 3146, "time_per_iteration": 2.351532220840454 }, { "auxiliary_loss_clip": 0.01093525, "auxiliary_loss_mlp": 0.01039422, "balance_loss_clip": 1.02119517, "balance_loss_mlp": 1.02663589, "epoch": 0.1892078761460995, "flos": 21542701921920.0, "grad_norm": 2.1745879156237082, "language_loss": 0.78983533, "learning_rate": 3.6572765708030813e-06, "loss": 0.81116486, "num_input_tokens_seen": 67926670, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.66796875, "step": 3147, "time_per_iteration": 2.396304130554199 }, { "auxiliary_loss_clip": 0.01090724, "auxiliary_loss_mlp": 0.01032712, "balance_loss_clip": 1.01547456, "balance_loss_mlp": 1.02646947, "epoch": 0.18926799939876748, "flos": 23000465496960.0, "grad_norm": 2.8520065875250187, "language_loss": 0.66726327, "learning_rate": 3.657065042181536e-06, "loss": 0.68849766, "num_input_tokens_seen": 67943645, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.640625, "step": 3148, "time_per_iteration": 2.395343065261841 }, { "auxiliary_loss_clip": 0.01091736, "auxiliary_loss_mlp": 0.01030399, "balance_loss_clip": 1.01361406, "balance_loss_mlp": 1.02645969, "epoch": 0.18932812265143545, "flos": 22271932823040.0, "grad_norm": 2.445524490717879, "language_loss": 0.76157504, "learning_rate": 3.6568534544236008e-06, "loss": 0.78279638, "num_input_tokens_seen": 67962345, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.65234375, "step": 3149, "time_per_iteration": 2.3933093547821045 }, { "auxiliary_loss_clip": 0.01090073, "auxiliary_loss_mlp": 0.01036841, "balance_loss_clip": 1.02015185, "balance_loss_mlp": 1.02690399, "epoch": 0.1893882459041034, "flos": 18623439256320.0, "grad_norm": 3.363665441741508, "language_loss": 0.81101823, "learning_rate": 3.656641807536828e-06, "loss": 0.83228737, "num_input_tokens_seen": 67979760, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.6328125, "step": 3150, "time_per_iteration": 2.3639278411865234 }, { "auxiliary_loss_clip": 0.01095356, "auxiliary_loss_mlp": 0.01041358, "balance_loss_clip": 1.022928, "balance_loss_mlp": 1.02777815, "epoch": 0.18944836915677138, "flos": 22891885568640.0, "grad_norm": 2.0949354009812304, "language_loss": 0.84872854, "learning_rate": 3.6564301015287706e-06, "loss": 0.87009573, "num_input_tokens_seen": 67996895, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.67578125, "step": 3151, "time_per_iteration": 2.3968522548675537 }, { "auxiliary_loss_clip": 0.01095155, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.0226388, "balance_loss_mlp": 1.02819836, "epoch": 0.18950849240943934, "flos": 26795349861120.0, "grad_norm": 1.9176161239989238, "language_loss": 0.74011457, "learning_rate": 3.6562183364069835e-06, "loss": 0.76146793, "num_input_tokens_seen": 68018365, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.66796875, "step": 3152, "time_per_iteration": 2.4383933544158936 }, { "auxiliary_loss_clip": 0.01091615, "auxiliary_loss_mlp": 0.01039334, "balance_loss_clip": 1.02085638, "balance_loss_mlp": 1.02577949, "epoch": 0.1895686156621073, "flos": 24970125559680.0, "grad_norm": 1.8800007116162436, "language_loss": 0.75120592, "learning_rate": 3.6560065121790244e-06, "loss": 0.77251536, "num_input_tokens_seen": 68037985, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.65625, "step": 3153, "time_per_iteration": 2.4134364128112793 }, { "auxiliary_loss_clip": 0.01093964, "auxiliary_loss_mlp": 0.01037649, "balance_loss_clip": 1.01952863, "balance_loss_mlp": 1.0262568, "epoch": 0.1896287389147753, "flos": 21943469952000.0, "grad_norm": 5.572109106942339, "language_loss": 0.79413539, "learning_rate": 3.655794628852453e-06, "loss": 0.8154515, "num_input_tokens_seen": 68057975, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6796875, "step": 3154, "time_per_iteration": 2.3982646465301514 }, { "auxiliary_loss_clip": 0.01094087, "auxiliary_loss_mlp": 0.01037787, "balance_loss_clip": 1.01841521, "balance_loss_mlp": 1.02587223, "epoch": 0.18968886216744327, "flos": 18178297021440.0, "grad_norm": 2.80094461298542, "language_loss": 0.72725987, "learning_rate": 3.6555826864348297e-06, "loss": 0.74857867, "num_input_tokens_seen": 68074175, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6796875, "step": 3155, "time_per_iteration": 2.3434884548187256 }, { "auxiliary_loss_clip": 0.01089547, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.01737821, "balance_loss_mlp": 1.02359009, "epoch": 0.18974898542011123, "flos": 20411446181760.0, "grad_norm": 2.2744931621323725, "language_loss": 0.7401787, "learning_rate": 3.6553706849337197e-06, "loss": 0.76142752, "num_input_tokens_seen": 68095230, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.65625, "step": 3156, "time_per_iteration": 2.408158540725708 }, { "auxiliary_loss_clip": 0.01093336, "auxiliary_loss_mlp": 0.01032518, "balance_loss_clip": 1.01494694, "balance_loss_mlp": 1.02672505, "epoch": 0.1898091086727792, "flos": 23983968896640.0, "grad_norm": 1.781444490073804, "language_loss": 0.67989981, "learning_rate": 3.6551586243566877e-06, "loss": 0.7011584, "num_input_tokens_seen": 68113805, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6640625, "step": 3157, "time_per_iteration": 2.396543502807617 }, { "auxiliary_loss_clip": 0.01091825, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.01412976, "balance_loss_mlp": 1.02482057, "epoch": 0.18986923192544716, "flos": 27635813953920.0, "grad_norm": 1.7151154497333212, "language_loss": 0.79707837, "learning_rate": 3.654946504711302e-06, "loss": 0.81831336, "num_input_tokens_seen": 68133190, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.671875, "step": 3158, "time_per_iteration": 2.4283323287963867 }, { "auxiliary_loss_clip": 0.01096707, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.01951361, "balance_loss_mlp": 1.02673757, "epoch": 0.18992935517811513, "flos": 25482964654080.0, "grad_norm": 2.665136737825096, "language_loss": 0.72027659, "learning_rate": 3.6547343260051323e-06, "loss": 0.74164271, "num_input_tokens_seen": 68152330, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.69921875, "step": 3159, "time_per_iteration": 2.4149372577667236 }, { "auxiliary_loss_clip": 0.01093615, "auxiliary_loss_mlp": 0.0104252, "balance_loss_clip": 1.02317274, "balance_loss_mlp": 1.02667046, "epoch": 0.1899894784307831, "flos": 17419843445760.0, "grad_norm": 2.4817875191132286, "language_loss": 0.85185206, "learning_rate": 3.6545220882457518e-06, "loss": 0.87321341, "num_input_tokens_seen": 68170185, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.671875, "step": 3160, "time_per_iteration": 2.370260238647461 }, { "auxiliary_loss_clip": 0.01088359, "auxiliary_loss_mlp": 0.01044831, "balance_loss_clip": 1.02783227, "balance_loss_mlp": 1.02554953, "epoch": 0.19004960168345109, "flos": 27490959256320.0, "grad_norm": 1.8624981820899977, "language_loss": 0.73385042, "learning_rate": 3.6543097914407336e-06, "loss": 0.75518227, "num_input_tokens_seen": 68191665, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.625, "step": 3161, "time_per_iteration": 2.4460105895996094 }, { "auxiliary_loss_clip": 0.01090908, "auxiliary_loss_mlp": 0.01041006, "balance_loss_clip": 1.0234704, "balance_loss_mlp": 1.02615905, "epoch": 0.19010972493611905, "flos": 38653145965440.0, "grad_norm": 1.8150751487237726, "language_loss": 0.80446106, "learning_rate": 3.6540974355976537e-06, "loss": 0.82578015, "num_input_tokens_seen": 68214635, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6484375, "step": 3162, "time_per_iteration": 2.5258138179779053 }, { "auxiliary_loss_clip": 0.01092994, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.01316249, "balance_loss_mlp": 1.02582717, "epoch": 0.19016984818878702, "flos": 19243741115520.0, "grad_norm": 3.1610870978860692, "language_loss": 0.75388765, "learning_rate": 3.653885020724092e-06, "loss": 0.77513468, "num_input_tokens_seen": 68232150, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.671875, "step": 3163, "time_per_iteration": 2.3539695739746094 }, { "auxiliary_loss_clip": 0.01091513, "auxiliary_loss_mlp": 0.01038314, "balance_loss_clip": 1.02019417, "balance_loss_mlp": 1.02672601, "epoch": 0.19022997144145498, "flos": 37595382370560.0, "grad_norm": 2.5051610458785984, "language_loss": 0.74053907, "learning_rate": 3.653672546827628e-06, "loss": 0.76183736, "num_input_tokens_seen": 68253370, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6484375, "step": 3164, "time_per_iteration": 2.5292487144470215 }, { "auxiliary_loss_clip": 0.01093023, "auxiliary_loss_mlp": 0.01028941, "balance_loss_clip": 1.01123857, "balance_loss_mlp": 1.02741444, "epoch": 0.19029009469412295, "flos": 61528762840320.0, "grad_norm": 1.4450338276097412, "language_loss": 0.66605741, "learning_rate": 3.653460013915844e-06, "loss": 0.68727708, "num_input_tokens_seen": 68278895, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.65625, "step": 3165, "time_per_iteration": 2.7367405891418457 }, { "auxiliary_loss_clip": 0.01095137, "auxiliary_loss_mlp": 0.01038974, "balance_loss_clip": 1.02067578, "balance_loss_mlp": 1.02840436, "epoch": 0.1903502179467909, "flos": 13953980534400.0, "grad_norm": 2.4984910791317807, "language_loss": 0.73748457, "learning_rate": 3.653247421996326e-06, "loss": 0.75882566, "num_input_tokens_seen": 68294880, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6640625, "step": 3166, "time_per_iteration": 2.3404548168182373 }, { "auxiliary_loss_clip": 0.01027155, "auxiliary_loss_mlp": 0.01025743, "balance_loss_clip": 1.02271545, "balance_loss_mlp": 1.0086807, "epoch": 0.1904103411994589, "flos": 66896168152320.0, "grad_norm": 0.7936556718366062, "language_loss": 0.50340271, "learning_rate": 3.65303477107666e-06, "loss": 0.52393174, "num_input_tokens_seen": 68359665, "router_z_loss_clip": 0.03027344, "router_z_loss_mlp": 0.18457031, "step": 3167, "time_per_iteration": 3.0473318099975586 }, { "auxiliary_loss_clip": 0.01092101, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.01715732, "balance_loss_mlp": 1.02729797, "epoch": 0.19047046445212687, "flos": 21907649030400.0, "grad_norm": 1.9924075328792246, "language_loss": 0.7409988, "learning_rate": 3.6528220611644356e-06, "loss": 0.7622633, "num_input_tokens_seen": 68378950, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6484375, "step": 3168, "time_per_iteration": 2.402815103530884 }, { "auxiliary_loss_clip": 0.01024807, "auxiliary_loss_mlp": 0.0101354, "balance_loss_clip": 1.01065552, "balance_loss_mlp": 1.00624621, "epoch": 0.19053058770479483, "flos": 59252424595200.0, "grad_norm": 0.8669423342558235, "language_loss": 0.6008268, "learning_rate": 3.652609292267242e-06, "loss": 0.62121028, "num_input_tokens_seen": 68434235, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.18554688, "step": 3169, "time_per_iteration": 2.9471940994262695 }, { "auxiliary_loss_clip": 0.01095435, "auxiliary_loss_mlp": 0.01041426, "balance_loss_clip": 1.02373552, "balance_loss_mlp": 1.0265286, "epoch": 0.1905907109574628, "flos": 23950172833920.0, "grad_norm": 1.6654320331704824, "language_loss": 0.78398848, "learning_rate": 3.6523964643926754e-06, "loss": 0.8053571, "num_input_tokens_seen": 68453830, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.6875, "step": 3170, "time_per_iteration": 2.4139318466186523 }, { "auxiliary_loss_clip": 0.01089523, "auxiliary_loss_mlp": 0.01036114, "balance_loss_clip": 1.01806617, "balance_loss_mlp": 1.02465212, "epoch": 0.19065083421013077, "flos": 20811306516480.0, "grad_norm": 1.71655659388284, "language_loss": 0.78177553, "learning_rate": 3.6521835775483285e-06, "loss": 0.80303192, "num_input_tokens_seen": 68473005, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6484375, "step": 3171, "time_per_iteration": 2.3902087211608887 }, { "auxiliary_loss_clip": 0.01093806, "auxiliary_loss_mlp": 0.0103914, "balance_loss_clip": 1.01995897, "balance_loss_mlp": 1.02565539, "epoch": 0.19071095746279873, "flos": 31283644204800.0, "grad_norm": 2.0296539216697793, "language_loss": 0.77943277, "learning_rate": 3.6519706317417995e-06, "loss": 0.8007623, "num_input_tokens_seen": 68493470, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6796875, "step": 3172, "time_per_iteration": 2.4620447158813477 }, { "auxiliary_loss_clip": 0.01093869, "auxiliary_loss_mlp": 0.0104006, "balance_loss_clip": 1.02184463, "balance_loss_mlp": 1.02689338, "epoch": 0.1907710807154667, "flos": 14355237323520.0, "grad_norm": 7.969753133433176, "language_loss": 0.80303502, "learning_rate": 3.6517576269806885e-06, "loss": 0.82437432, "num_input_tokens_seen": 68511290, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.66796875, "step": 3173, "time_per_iteration": 2.364328384399414 }, { "auxiliary_loss_clip": 0.01093834, "auxiliary_loss_mlp": 0.01049215, "balance_loss_clip": 1.03058267, "balance_loss_mlp": 1.02608895, "epoch": 0.1908312039681347, "flos": 26905815002880.0, "grad_norm": 1.5726743213791063, "language_loss": 0.78732854, "learning_rate": 3.651544563272597e-06, "loss": 0.80875897, "num_input_tokens_seen": 68532575, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.6796875, "step": 3174, "time_per_iteration": 3.873528242111206 }, { "auxiliary_loss_clip": 0.0109644, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.02575302, "balance_loss_mlp": 1.02894258, "epoch": 0.19089132722080265, "flos": 14494017444480.0, "grad_norm": 2.560932419383946, "language_loss": 0.81298071, "learning_rate": 3.651331440625127e-06, "loss": 0.83438522, "num_input_tokens_seen": 68548760, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.671875, "step": 3175, "time_per_iteration": 2.3709824085235596 }, { "auxiliary_loss_clip": 0.01095595, "auxiliary_loss_mlp": 0.01047697, "balance_loss_clip": 1.02912462, "balance_loss_mlp": 1.02780724, "epoch": 0.19095145047347062, "flos": 13952060409600.0, "grad_norm": 2.1343172854609658, "language_loss": 0.85423797, "learning_rate": 3.651118259045887e-06, "loss": 0.87567091, "num_input_tokens_seen": 68563100, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.67578125, "step": 3176, "time_per_iteration": 2.352125883102417 }, { "auxiliary_loss_clip": 0.01097972, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.0299871, "balance_loss_mlp": 1.02877474, "epoch": 0.19101157372613858, "flos": 25300648200960.0, "grad_norm": 1.9692737698191998, "language_loss": 0.81437957, "learning_rate": 3.650905018542483e-06, "loss": 0.8358618, "num_input_tokens_seen": 68581650, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.69140625, "step": 3177, "time_per_iteration": 3.792844533920288 }, { "auxiliary_loss_clip": 0.01092264, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.01786613, "balance_loss_mlp": 1.02616823, "epoch": 0.19107169697880655, "flos": 20557173663360.0, "grad_norm": 2.7706199197200676, "language_loss": 0.74712181, "learning_rate": 3.650691719122525e-06, "loss": 0.76840878, "num_input_tokens_seen": 68600360, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6640625, "step": 3178, "time_per_iteration": 3.7311582565307617 }, { "auxiliary_loss_clip": 0.01094936, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.01819444, "balance_loss_mlp": 1.02852631, "epoch": 0.19113182023147451, "flos": 22162130997120.0, "grad_norm": 1.6771813715315846, "language_loss": 0.81381947, "learning_rate": 3.6504783607936266e-06, "loss": 0.83513093, "num_input_tokens_seen": 68617885, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6640625, "step": 3179, "time_per_iteration": 2.387749671936035 }, { "auxiliary_loss_clip": 0.01095876, "auxiliary_loss_mlp": 0.01035741, "balance_loss_clip": 1.01747799, "balance_loss_mlp": 1.02783322, "epoch": 0.19119194348414248, "flos": 18580985176320.0, "grad_norm": 3.571695730743239, "language_loss": 0.80022579, "learning_rate": 3.6502649435634006e-06, "loss": 0.82154197, "num_input_tokens_seen": 68634550, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.6796875, "step": 3180, "time_per_iteration": 3.7456233501434326 }, { "auxiliary_loss_clip": 0.01092087, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.02328157, "balance_loss_mlp": 1.02579308, "epoch": 0.19125206673681047, "flos": 19025603740800.0, "grad_norm": 2.4491028994390365, "language_loss": 0.79080421, "learning_rate": 3.6500514674394634e-06, "loss": 0.81214631, "num_input_tokens_seen": 68651895, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6640625, "step": 3181, "time_per_iteration": 2.3571255207061768 }, { "auxiliary_loss_clip": 0.01094004, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.01886034, "balance_loss_mlp": 1.02600908, "epoch": 0.19131218998947844, "flos": 21689057808000.0, "grad_norm": 1.8849302621669406, "language_loss": 0.73793995, "learning_rate": 3.649837932429434e-06, "loss": 0.75925112, "num_input_tokens_seen": 68671500, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.6796875, "step": 3182, "time_per_iteration": 2.392728090286255 }, { "auxiliary_loss_clip": 0.01094247, "auxiliary_loss_mlp": 0.01040668, "balance_loss_clip": 1.02185655, "balance_loss_mlp": 1.02753794, "epoch": 0.1913723132421464, "flos": 18441506828160.0, "grad_norm": 1.7585453217167473, "language_loss": 0.64951855, "learning_rate": 3.649624338540933e-06, "loss": 0.67086768, "num_input_tokens_seen": 68690570, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6640625, "step": 3183, "time_per_iteration": 2.3653316497802734 }, { "auxiliary_loss_clip": 0.01092735, "auxiliary_loss_mlp": 0.01043071, "balance_loss_clip": 1.02306771, "balance_loss_mlp": 1.0258553, "epoch": 0.19143243649481437, "flos": 27158935426560.0, "grad_norm": 1.5167240814876268, "language_loss": 0.73595703, "learning_rate": 3.649410685781582e-06, "loss": 0.75731504, "num_input_tokens_seen": 68709735, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.66796875, "step": 3184, "time_per_iteration": 2.432704210281372 }, { "auxiliary_loss_clip": 0.01092212, "auxiliary_loss_mlp": 0.01035316, "balance_loss_clip": 1.01520491, "balance_loss_mlp": 1.02478993, "epoch": 0.19149255974748233, "flos": 21718071014400.0, "grad_norm": 1.9895216461375365, "language_loss": 0.88315654, "learning_rate": 3.6491969741590075e-06, "loss": 0.90443182, "num_input_tokens_seen": 68727565, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.671875, "step": 3185, "time_per_iteration": 2.3759915828704834 }, { "auxiliary_loss_clip": 0.01092097, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.0140481, "balance_loss_mlp": 1.02576983, "epoch": 0.1915526830001503, "flos": 22962270602880.0, "grad_norm": 2.0860743889738442, "language_loss": 0.72633183, "learning_rate": 3.648983203680834e-06, "loss": 0.74758548, "num_input_tokens_seen": 68748110, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6640625, "step": 3186, "time_per_iteration": 2.4003496170043945 }, { "auxiliary_loss_clip": 0.01096054, "auxiliary_loss_mlp": 0.0103823, "balance_loss_clip": 1.01659393, "balance_loss_mlp": 1.02652895, "epoch": 0.1916128062528183, "flos": 26139541282560.0, "grad_norm": 1.784433829999298, "language_loss": 0.83411252, "learning_rate": 3.6487693743546927e-06, "loss": 0.8554554, "num_input_tokens_seen": 68769765, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 0.6953125, "step": 3187, "time_per_iteration": 2.4265408515930176 }, { "auxiliary_loss_clip": 0.01027159, "auxiliary_loss_mlp": 0.01014839, "balance_loss_clip": 1.01260972, "balance_loss_mlp": 1.00835872, "epoch": 0.19167292950548626, "flos": 54922809847680.0, "grad_norm": 0.853340881450663, "language_loss": 0.55857521, "learning_rate": 3.648555486188213e-06, "loss": 0.57899523, "num_input_tokens_seen": 68826815, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.1875, "step": 3188, "time_per_iteration": 3.071138858795166 }, { "auxiliary_loss_clip": 0.01093198, "auxiliary_loss_mlp": 0.01037988, "balance_loss_clip": 1.01933169, "balance_loss_mlp": 1.02743053, "epoch": 0.19173305275815422, "flos": 29934286001280.0, "grad_norm": 1.6054637380264414, "language_loss": 0.70125937, "learning_rate": 3.648341539189029e-06, "loss": 0.72257119, "num_input_tokens_seen": 68847585, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.65625, "step": 3189, "time_per_iteration": 2.4534246921539307 }, { "auxiliary_loss_clip": 0.01088219, "auxiliary_loss_mlp": 0.01031638, "balance_loss_clip": 1.01462686, "balance_loss_mlp": 1.02538657, "epoch": 0.1917931760108222, "flos": 24751359780480.0, "grad_norm": 3.424436763277206, "language_loss": 0.74134934, "learning_rate": 3.648127533364775e-06, "loss": 0.76254797, "num_input_tokens_seen": 68866620, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.62890625, "step": 3190, "time_per_iteration": 2.4190168380737305 }, { "auxiliary_loss_clip": 0.01092916, "auxiliary_loss_mlp": 0.01048443, "balance_loss_clip": 1.02938151, "balance_loss_mlp": 1.02776313, "epoch": 0.19185329926349015, "flos": 18842554149120.0, "grad_norm": 3.6163422025086005, "language_loss": 0.8435185, "learning_rate": 3.6479134687230887e-06, "loss": 0.86493206, "num_input_tokens_seen": 68885515, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6484375, "step": 3191, "time_per_iteration": 2.377803087234497 }, { "auxiliary_loss_clip": 0.01090419, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.01468277, "balance_loss_mlp": 1.0269171, "epoch": 0.19191342251615812, "flos": 22085880854400.0, "grad_norm": 1.8367942314446566, "language_loss": 0.89690745, "learning_rate": 3.64769934527161e-06, "loss": 0.91813433, "num_input_tokens_seen": 68903225, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6328125, "step": 3192, "time_per_iteration": 2.38694167137146 }, { "auxiliary_loss_clip": 0.01095632, "auxiliary_loss_mlp": 0.01041773, "balance_loss_clip": 1.02150786, "balance_loss_mlp": 1.02851701, "epoch": 0.19197354576882608, "flos": 22198056652800.0, "grad_norm": 1.7812500685174586, "language_loss": 0.74489391, "learning_rate": 3.64748516301798e-06, "loss": 0.7662679, "num_input_tokens_seen": 68922860, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.671875, "step": 3193, "time_per_iteration": 2.389866352081299 }, { "auxiliary_loss_clip": 0.01096063, "auxiliary_loss_mlp": 0.0103819, "balance_loss_clip": 1.01852024, "balance_loss_mlp": 1.02679372, "epoch": 0.19203366902149407, "flos": 24895132225920.0, "grad_norm": 1.7237964102702663, "language_loss": 0.7463479, "learning_rate": 3.6472709219698422e-06, "loss": 0.76769042, "num_input_tokens_seen": 68943000, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.69140625, "step": 3194, "time_per_iteration": 2.419905185699463 }, { "auxiliary_loss_clip": 0.01022737, "auxiliary_loss_mlp": 0.01002894, "balance_loss_clip": 1.00068891, "balance_loss_mlp": 1.00399566, "epoch": 0.19209379227416204, "flos": 68413633885440.0, "grad_norm": 0.7852784648745245, "language_loss": 0.68454325, "learning_rate": 3.647056622134843e-06, "loss": 0.70479953, "num_input_tokens_seen": 69000255, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.1875, "step": 3195, "time_per_iteration": 2.9521589279174805 }, { "auxiliary_loss_clip": 0.0109407, "auxiliary_loss_mlp": 0.01042562, "balance_loss_clip": 1.02328563, "balance_loss_mlp": 1.02698195, "epoch": 0.19215391552683, "flos": 22054074739200.0, "grad_norm": 2.5447857934723115, "language_loss": 0.72515213, "learning_rate": 3.6468422635206297e-06, "loss": 0.74651849, "num_input_tokens_seen": 69019665, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.671875, "step": 3196, "time_per_iteration": 2.3983092308044434 }, { "auxiliary_loss_clip": 0.01096925, "auxiliary_loss_mlp": 0.01042756, "balance_loss_clip": 1.02365923, "balance_loss_mlp": 1.03062785, "epoch": 0.19221403877949797, "flos": 20301923646720.0, "grad_norm": 2.1716943994036444, "language_loss": 0.83250105, "learning_rate": 3.6466278461348514e-06, "loss": 0.85389781, "num_input_tokens_seen": 69039055, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6640625, "step": 3197, "time_per_iteration": 2.3770110607147217 }, { "auxiliary_loss_clip": 0.01092871, "auxiliary_loss_mlp": 0.01035991, "balance_loss_clip": 1.0170486, "balance_loss_mlp": 1.02595544, "epoch": 0.19227416203216594, "flos": 23184213315840.0, "grad_norm": 2.105888060625776, "language_loss": 0.80370164, "learning_rate": 3.646413369985161e-06, "loss": 0.82499027, "num_input_tokens_seen": 69056370, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.66796875, "step": 3198, "time_per_iteration": 2.3984158039093018 }, { "auxiliary_loss_clip": 0.01094667, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.01867473, "balance_loss_mlp": 1.02617788, "epoch": 0.1923342852848339, "flos": 25775397135360.0, "grad_norm": 2.109748742392438, "language_loss": 0.78280067, "learning_rate": 3.6461988350792137e-06, "loss": 0.80414265, "num_input_tokens_seen": 69075915, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.6875, "step": 3199, "time_per_iteration": 2.41072940826416 }, { "auxiliary_loss_clip": 0.01094885, "auxiliary_loss_mlp": 0.01036543, "balance_loss_clip": 1.0186615, "balance_loss_mlp": 1.02991319, "epoch": 0.19239440853750187, "flos": 17127410964480.0, "grad_norm": 2.4234828034627993, "language_loss": 0.83533007, "learning_rate": 3.6459842414246636e-06, "loss": 0.85664433, "num_input_tokens_seen": 69094145, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6484375, "step": 3200, "time_per_iteration": 2.351614475250244 }, { "auxiliary_loss_clip": 0.01095715, "auxiliary_loss_mlp": 0.01044843, "balance_loss_clip": 1.02640176, "balance_loss_mlp": 1.02879262, "epoch": 0.19245453179016986, "flos": 16434175541760.0, "grad_norm": 2.052054101014935, "language_loss": 0.79116702, "learning_rate": 3.6457695890291697e-06, "loss": 0.8125726, "num_input_tokens_seen": 69111110, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.66796875, "step": 3201, "time_per_iteration": 2.348501682281494 }, { "auxiliary_loss_clip": 0.01095271, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.01831102, "balance_loss_mlp": 1.02722049, "epoch": 0.19251465504283782, "flos": 20229234462720.0, "grad_norm": 2.270429026092637, "language_loss": 0.69541204, "learning_rate": 3.645554877900393e-06, "loss": 0.71674228, "num_input_tokens_seen": 69130280, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6796875, "step": 3202, "time_per_iteration": 2.384735584259033 }, { "auxiliary_loss_clip": 0.01092863, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.01412892, "balance_loss_mlp": 1.02728343, "epoch": 0.1925747782955058, "flos": 19463344767360.0, "grad_norm": 2.5595947348008443, "language_loss": 0.91117144, "learning_rate": 3.645340108045995e-06, "loss": 0.93242729, "num_input_tokens_seen": 69149570, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.65625, "step": 3203, "time_per_iteration": 2.3753819465637207 }, { "auxiliary_loss_clip": 0.01094895, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.02277279, "balance_loss_mlp": 1.02658033, "epoch": 0.19263490154817375, "flos": 17784615997440.0, "grad_norm": 1.9669752936168026, "language_loss": 0.81680238, "learning_rate": 3.6451252794736417e-06, "loss": 0.83817607, "num_input_tokens_seen": 69168190, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6796875, "step": 3204, "time_per_iteration": 2.368269920349121 }, { "auxiliary_loss_clip": 0.01092571, "auxiliary_loss_mlp": 0.01039142, "balance_loss_clip": 1.02041459, "balance_loss_mlp": 1.02673435, "epoch": 0.19269502480084172, "flos": 17456118215040.0, "grad_norm": 1.988349642468062, "language_loss": 0.75792122, "learning_rate": 3.6449103921909983e-06, "loss": 0.77923828, "num_input_tokens_seen": 69186950, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.66015625, "step": 3205, "time_per_iteration": 2.3562424182891846 }, { "auxiliary_loss_clip": 0.01096497, "auxiliary_loss_mlp": 0.01039308, "balance_loss_clip": 1.02030611, "balance_loss_mlp": 1.02881098, "epoch": 0.19275514805350968, "flos": 21505833659520.0, "grad_norm": 2.9249157336018707, "language_loss": 0.82801032, "learning_rate": 3.644695446205735e-06, "loss": 0.84936833, "num_input_tokens_seen": 69204850, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.67578125, "step": 3206, "time_per_iteration": 2.388993740081787 }, { "auxiliary_loss_clip": 0.01024912, "auxiliary_loss_mlp": 0.01007991, "balance_loss_clip": 1.00558305, "balance_loss_mlp": 1.00593722, "epoch": 0.19281527130617768, "flos": 47693379928320.0, "grad_norm": 0.8365671296608214, "language_loss": 0.60553396, "learning_rate": 3.644480441525521e-06, "loss": 0.62586296, "num_input_tokens_seen": 69259200, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.18945312, "step": 3207, "time_per_iteration": 2.855283260345459 }, { "auxiliary_loss_clip": 0.01092823, "auxiliary_loss_mlp": 0.0103696, "balance_loss_clip": 1.01725471, "balance_loss_mlp": 1.02535605, "epoch": 0.19287539455884564, "flos": 11800467918720.0, "grad_norm": 5.028053307957577, "language_loss": 0.74671447, "learning_rate": 3.6442653781580305e-06, "loss": 0.76801234, "num_input_tokens_seen": 69275835, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.671875, "step": 3208, "time_per_iteration": 2.3509304523468018 }, { "auxiliary_loss_clip": 0.01094353, "auxiliary_loss_mlp": 0.01039326, "balance_loss_clip": 1.01995444, "balance_loss_mlp": 1.02553701, "epoch": 0.1929355178115136, "flos": 20630386517760.0, "grad_norm": 2.0997662987306325, "language_loss": 0.60876942, "learning_rate": 3.6440502561109384e-06, "loss": 0.63010621, "num_input_tokens_seen": 69294810, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6875, "step": 3209, "time_per_iteration": 2.3633580207824707 }, { "auxiliary_loss_clip": 0.01096845, "auxiliary_loss_mlp": 0.01043453, "balance_loss_clip": 1.02256715, "balance_loss_mlp": 1.02674937, "epoch": 0.19299564106418157, "flos": 40806309467520.0, "grad_norm": 1.9447056824487978, "language_loss": 0.7999202, "learning_rate": 3.6438350753919213e-06, "loss": 0.82132316, "num_input_tokens_seen": 69316065, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.69921875, "step": 3210, "time_per_iteration": 2.5475172996520996 }, { "auxiliary_loss_clip": 0.01088278, "auxiliary_loss_mlp": 0.01037149, "balance_loss_clip": 1.01960135, "balance_loss_mlp": 1.02380741, "epoch": 0.19305576431684954, "flos": 11360702033280.0, "grad_norm": 2.2666304209789923, "language_loss": 0.82905734, "learning_rate": 3.643619836008659e-06, "loss": 0.85031164, "num_input_tokens_seen": 69332900, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.64453125, "step": 3211, "time_per_iteration": 2.3530471324920654 }, { "auxiliary_loss_clip": 0.01021493, "auxiliary_loss_mlp": 0.01002706, "balance_loss_clip": 1.00033367, "balance_loss_mlp": 1.00280476, "epoch": 0.1931158875695175, "flos": 54509299171200.0, "grad_norm": 0.9651194982063522, "language_loss": 0.63612223, "learning_rate": 3.6434045379688324e-06, "loss": 0.6563642, "num_input_tokens_seen": 69382535, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.18652344, "step": 3212, "time_per_iteration": 2.9029247760772705 }, { "auxiliary_loss_clip": 0.01093502, "auxiliary_loss_mlp": 0.01041722, "balance_loss_clip": 1.02342355, "balance_loss_mlp": 1.02697301, "epoch": 0.19317601082218547, "flos": 19827419091840.0, "grad_norm": 1.7286706440858817, "language_loss": 0.76006323, "learning_rate": 3.6431891812801254e-06, "loss": 0.78141546, "num_input_tokens_seen": 69400600, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6640625, "step": 3213, "time_per_iteration": 2.38496732711792 }, { "auxiliary_loss_clip": 0.01096553, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.02217579, "balance_loss_mlp": 1.02772832, "epoch": 0.19323613407485346, "flos": 13151222576640.0, "grad_norm": 2.0194626720413957, "language_loss": 0.71029568, "learning_rate": 3.6429737659502237e-06, "loss": 0.73167491, "num_input_tokens_seen": 69417350, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6875, "step": 3214, "time_per_iteration": 3.7430498600006104 }, { "auxiliary_loss_clip": 0.01092736, "auxiliary_loss_mlp": 0.0103565, "balance_loss_clip": 1.01599288, "balance_loss_mlp": 1.02590609, "epoch": 0.19329625732752143, "flos": 14026390427520.0, "grad_norm": 2.04791721415141, "language_loss": 0.74819297, "learning_rate": 3.642758291986814e-06, "loss": 0.76947683, "num_input_tokens_seen": 69431845, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.66796875, "step": 3215, "time_per_iteration": 2.345534086227417 }, { "auxiliary_loss_clip": 0.01089553, "auxiliary_loss_mlp": 0.01040103, "balance_loss_clip": 1.02125573, "balance_loss_mlp": 1.02407169, "epoch": 0.1933563805801894, "flos": 23440580496000.0, "grad_norm": 3.8363575180347804, "language_loss": 0.88652748, "learning_rate": 3.642542759397587e-06, "loss": 0.90782398, "num_input_tokens_seen": 69453275, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.65625, "step": 3216, "time_per_iteration": 2.4154319763183594 }, { "auxiliary_loss_clip": 0.01092795, "auxiliary_loss_mlp": 0.01040408, "balance_loss_clip": 1.02185881, "balance_loss_mlp": 1.0269376, "epoch": 0.19341650383285736, "flos": 20484275011200.0, "grad_norm": 1.7398186088472865, "language_loss": 0.80092424, "learning_rate": 3.6423271681902336e-06, "loss": 0.82225633, "num_input_tokens_seen": 69471830, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.65625, "step": 3217, "time_per_iteration": 3.7727136611938477 }, { "auxiliary_loss_clip": 0.01096014, "auxiliary_loss_mlp": 0.01038216, "balance_loss_clip": 1.01750958, "balance_loss_mlp": 1.02631855, "epoch": 0.19347662708552532, "flos": 17857514649600.0, "grad_norm": 2.3762710878227398, "language_loss": 0.61644971, "learning_rate": 3.642111518372448e-06, "loss": 0.63779199, "num_input_tokens_seen": 69489320, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.6953125, "step": 3218, "time_per_iteration": 3.738217830657959 }, { "auxiliary_loss_clip": 0.01093309, "auxiliary_loss_mlp": 0.01041184, "balance_loss_clip": 1.0221107, "balance_loss_mlp": 1.02671599, "epoch": 0.1935367503381933, "flos": 18186256811520.0, "grad_norm": 1.9905440332339441, "language_loss": 0.80267423, "learning_rate": 3.6418958099519267e-06, "loss": 0.82401913, "num_input_tokens_seen": 69506665, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6640625, "step": 3219, "time_per_iteration": 3.819187641143799 }, { "auxiliary_loss_clip": 0.01094091, "auxiliary_loss_mlp": 0.01043146, "balance_loss_clip": 1.02382195, "balance_loss_mlp": 1.02746654, "epoch": 0.19359687359086128, "flos": 15956319496320.0, "grad_norm": 2.4496600917349647, "language_loss": 0.85869569, "learning_rate": 3.6416800429363674e-06, "loss": 0.88006806, "num_input_tokens_seen": 69523835, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.66796875, "step": 3220, "time_per_iteration": 2.361116886138916 }, { "auxiliary_loss_clip": 0.01088917, "auxiliary_loss_mlp": 0.01035224, "balance_loss_clip": 1.01898813, "balance_loss_mlp": 1.02643895, "epoch": 0.19365699684352924, "flos": 21214134316800.0, "grad_norm": 3.4607468513524915, "language_loss": 0.84419346, "learning_rate": 3.6414642173334704e-06, "loss": 0.86543494, "num_input_tokens_seen": 69542620, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.625, "step": 3221, "time_per_iteration": 2.382223129272461 }, { "auxiliary_loss_clip": 0.01092355, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.01993644, "balance_loss_mlp": 1.0285064, "epoch": 0.1937171200961972, "flos": 17310146353920.0, "grad_norm": 2.191763726116518, "language_loss": 0.86122036, "learning_rate": 3.6412483331509373e-06, "loss": 0.88251173, "num_input_tokens_seen": 69561130, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.640625, "step": 3222, "time_per_iteration": 2.412787437438965 }, { "auxiliary_loss_clip": 0.0109185, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.01404715, "balance_loss_mlp": 1.02545762, "epoch": 0.19377724334886517, "flos": 22634924895360.0, "grad_norm": 2.525734028275212, "language_loss": 0.78428602, "learning_rate": 3.641032390396473e-06, "loss": 0.80552936, "num_input_tokens_seen": 69580425, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6640625, "step": 3223, "time_per_iteration": 2.4569711685180664 }, { "auxiliary_loss_clip": 0.01090868, "auxiliary_loss_mlp": 0.01034828, "balance_loss_clip": 1.01762629, "balance_loss_mlp": 1.02666807, "epoch": 0.19383736660153314, "flos": 15077136839040.0, "grad_norm": 2.1340963987027926, "language_loss": 0.75416589, "learning_rate": 3.6408163890777843e-06, "loss": 0.77542287, "num_input_tokens_seen": 69597085, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.640625, "step": 3224, "time_per_iteration": 2.358529567718506 }, { "auxiliary_loss_clip": 0.01090426, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.01259124, "balance_loss_mlp": 1.02646494, "epoch": 0.1938974898542011, "flos": 47118152367360.0, "grad_norm": 2.2078840050057473, "language_loss": 0.70660877, "learning_rate": 3.640600329202579e-06, "loss": 0.72782701, "num_input_tokens_seen": 69618885, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.640625, "step": 3225, "time_per_iteration": 2.617093563079834 }, { "auxiliary_loss_clip": 0.01090086, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.01528764, "balance_loss_mlp": 1.02510476, "epoch": 0.19395761310686907, "flos": 25811357702400.0, "grad_norm": 2.4472171369842837, "language_loss": 0.69760823, "learning_rate": 3.6403842107785686e-06, "loss": 0.71883965, "num_input_tokens_seen": 69638200, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6484375, "step": 3226, "time_per_iteration": 2.4120709896087646 }, { "auxiliary_loss_clip": 0.01091227, "auxiliary_loss_mlp": 0.01039145, "balance_loss_clip": 1.02029812, "balance_loss_mlp": 1.02752805, "epoch": 0.19401773635953706, "flos": 23038485834240.0, "grad_norm": 1.6591788837545542, "language_loss": 0.76039732, "learning_rate": 3.6401680338134653e-06, "loss": 0.78170109, "num_input_tokens_seen": 69657550, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.63671875, "step": 3227, "time_per_iteration": 2.3980982303619385 }, { "auxiliary_loss_clip": 0.01090895, "auxiliary_loss_mlp": 0.01040711, "balance_loss_clip": 1.02276993, "balance_loss_mlp": 1.02430868, "epoch": 0.19407785961220503, "flos": 15919974904320.0, "grad_norm": 1.8649498984843145, "language_loss": 0.69280744, "learning_rate": 3.6399517983149838e-06, "loss": 0.71412349, "num_input_tokens_seen": 69675005, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.66796875, "step": 3228, "time_per_iteration": 2.3586676120758057 }, { "auxiliary_loss_clip": 0.01091473, "auxiliary_loss_mlp": 0.01043592, "balance_loss_clip": 1.02537704, "balance_loss_mlp": 1.02686977, "epoch": 0.194137982864873, "flos": 25920531123840.0, "grad_norm": 2.220200115653601, "language_loss": 0.74391913, "learning_rate": 3.6397355042908407e-06, "loss": 0.76526976, "num_input_tokens_seen": 69696455, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.64453125, "step": 3229, "time_per_iteration": 2.4335381984710693 }, { "auxiliary_loss_clip": 0.01091645, "auxiliary_loss_mlp": 0.01033087, "balance_loss_clip": 1.01598048, "balance_loss_mlp": 1.02630305, "epoch": 0.19419810611754096, "flos": 13260500732160.0, "grad_norm": 2.4094008360195143, "language_loss": 0.65313721, "learning_rate": 3.6395191517487557e-06, "loss": 0.67438447, "num_input_tokens_seen": 69714245, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.65234375, "step": 3230, "time_per_iteration": 2.3724405765533447 }, { "auxiliary_loss_clip": 0.01089027, "auxiliary_loss_mlp": 0.01036275, "balance_loss_clip": 1.01831055, "balance_loss_mlp": 1.02513218, "epoch": 0.19425822937020892, "flos": 15704665349760.0, "grad_norm": 1.9126845229967357, "language_loss": 0.82243401, "learning_rate": 3.6393027406964494e-06, "loss": 0.84368706, "num_input_tokens_seen": 69731515, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.640625, "step": 3231, "time_per_iteration": 2.372706174850464 }, { "auxiliary_loss_clip": 0.01093246, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.01459241, "balance_loss_mlp": 1.02733052, "epoch": 0.1943183526228769, "flos": 23104472037120.0, "grad_norm": 1.882908253566389, "language_loss": 0.87074304, "learning_rate": 3.639086271141645e-06, "loss": 0.89202118, "num_input_tokens_seen": 69748885, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.66015625, "step": 3232, "time_per_iteration": 2.3734681606292725 }, { "auxiliary_loss_clip": 0.01092436, "auxiliary_loss_mlp": 0.01037586, "balance_loss_clip": 1.01937103, "balance_loss_mlp": 1.02718019, "epoch": 0.19437847587554485, "flos": 24711593875200.0, "grad_norm": 1.7653542276973573, "language_loss": 0.85239351, "learning_rate": 3.6388697430920674e-06, "loss": 0.8736937, "num_input_tokens_seen": 69767540, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.65234375, "step": 3233, "time_per_iteration": 2.4153127670288086 }, { "auxiliary_loss_clip": 0.01093505, "auxiliary_loss_mlp": 0.01042232, "balance_loss_clip": 1.02377868, "balance_loss_mlp": 1.02482581, "epoch": 0.19443859912821285, "flos": 23114910533760.0, "grad_norm": 1.7319351285692142, "language_loss": 0.88985711, "learning_rate": 3.638653156555445e-06, "loss": 0.91121447, "num_input_tokens_seen": 69789340, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6875, "step": 3234, "time_per_iteration": 2.4098269939422607 }, { "auxiliary_loss_clip": 0.01091998, "auxiliary_loss_mlp": 0.01033936, "balance_loss_clip": 1.01485085, "balance_loss_mlp": 1.02412546, "epoch": 0.1944987223808808, "flos": 15083525617920.0, "grad_norm": 5.7409567116605515, "language_loss": 0.78201854, "learning_rate": 3.638436511539507e-06, "loss": 0.80327791, "num_input_tokens_seen": 69806470, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6796875, "step": 3235, "time_per_iteration": 2.35680890083313 }, { "auxiliary_loss_clip": 0.01092563, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.01667261, "balance_loss_mlp": 1.02676952, "epoch": 0.19455884563354878, "flos": 17125979598720.0, "grad_norm": 1.953177095274907, "language_loss": 0.79242563, "learning_rate": 3.6382198080519833e-06, "loss": 0.81369209, "num_input_tokens_seen": 69822655, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.66015625, "step": 3236, "time_per_iteration": 2.345677614212036 }, { "auxiliary_loss_clip": 0.01090901, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.01946533, "balance_loss_mlp": 1.02475131, "epoch": 0.19461896888621674, "flos": 20192366200320.0, "grad_norm": 1.503568861209779, "language_loss": 0.7555871, "learning_rate": 3.6380030461006093e-06, "loss": 0.77687538, "num_input_tokens_seen": 69841895, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6640625, "step": 3237, "time_per_iteration": 2.3915979862213135 }, { "auxiliary_loss_clip": 0.01092935, "auxiliary_loss_mlp": 0.01038811, "balance_loss_clip": 1.02026165, "balance_loss_mlp": 1.02521873, "epoch": 0.1946790921388847, "flos": 25300194353280.0, "grad_norm": 1.5232704083913822, "language_loss": 0.75017565, "learning_rate": 3.6377862256931203e-06, "loss": 0.77149314, "num_input_tokens_seen": 69862220, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.67578125, "step": 3238, "time_per_iteration": 2.416321039199829 }, { "auxiliary_loss_clip": 0.01094228, "auxiliary_loss_mlp": 0.01040916, "balance_loss_clip": 1.02069783, "balance_loss_mlp": 1.02699256, "epoch": 0.19473921539155267, "flos": 20192366200320.0, "grad_norm": 1.9690252501932926, "language_loss": 0.73038596, "learning_rate": 3.637569346837253e-06, "loss": 0.75173736, "num_input_tokens_seen": 69881830, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.671875, "step": 3239, "time_per_iteration": 2.3719887733459473 }, { "auxiliary_loss_clip": 0.01091792, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.02036738, "balance_loss_mlp": 1.02522206, "epoch": 0.19479933864422067, "flos": 20886474407040.0, "grad_norm": 1.7556989267650014, "language_loss": 0.7344541, "learning_rate": 3.6373524095407485e-06, "loss": 0.75575721, "num_input_tokens_seen": 69900515, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.66796875, "step": 3240, "time_per_iteration": 2.3777613639831543 }, { "auxiliary_loss_clip": 0.01091297, "auxiliary_loss_mlp": 0.01034383, "balance_loss_clip": 1.01701427, "balance_loss_mlp": 1.02552462, "epoch": 0.19485946189688863, "flos": 23293945319040.0, "grad_norm": 1.9485447126523352, "language_loss": 0.66402727, "learning_rate": 3.637135413811348e-06, "loss": 0.68528414, "num_input_tokens_seen": 69920060, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.65625, "step": 3241, "time_per_iteration": 2.399484395980835 }, { "auxiliary_loss_clip": 0.0109156, "auxiliary_loss_mlp": 0.01037781, "balance_loss_clip": 1.01962543, "balance_loss_mlp": 1.02629566, "epoch": 0.1949195851495566, "flos": 23293910407680.0, "grad_norm": 1.9349221812557778, "language_loss": 0.8284806, "learning_rate": 3.636918359656796e-06, "loss": 0.84977406, "num_input_tokens_seen": 69939820, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.65234375, "step": 3242, "time_per_iteration": 2.4018092155456543 }, { "auxiliary_loss_clip": 0.01023619, "auxiliary_loss_mlp": 0.01003701, "balance_loss_clip": 1.00105453, "balance_loss_mlp": 1.00448895, "epoch": 0.19497970840222456, "flos": 64959536102400.0, "grad_norm": 0.8196101855354372, "language_loss": 0.57456034, "learning_rate": 3.636701247084839e-06, "loss": 0.59483355, "num_input_tokens_seen": 70002145, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.19140625, "step": 3243, "time_per_iteration": 3.0555503368377686 }, { "auxiliary_loss_clip": 0.01095224, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.0204618, "balance_loss_mlp": 1.0275383, "epoch": 0.19503983165489253, "flos": 19643741095680.0, "grad_norm": 2.016429930690371, "language_loss": 0.83302236, "learning_rate": 3.6364840761032238e-06, "loss": 0.85436511, "num_input_tokens_seen": 70020510, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.67578125, "step": 3244, "time_per_iteration": 2.3813095092773438 }, { "auxiliary_loss_clip": 0.01094253, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.01697576, "balance_loss_mlp": 1.02831161, "epoch": 0.1950999549075605, "flos": 21140921462400.0, "grad_norm": 1.6515982101360513, "language_loss": 0.7687943, "learning_rate": 3.6362668467197015e-06, "loss": 0.7900939, "num_input_tokens_seen": 70040760, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65625, "step": 3245, "time_per_iteration": 2.3906078338623047 }, { "auxiliary_loss_clip": 0.01093757, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.01860094, "balance_loss_mlp": 1.02652812, "epoch": 0.19516007816022846, "flos": 20883821143680.0, "grad_norm": 1.9215117662172279, "language_loss": 0.84480739, "learning_rate": 3.6360495589420247e-06, "loss": 0.86612833, "num_input_tokens_seen": 70058720, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.671875, "step": 3246, "time_per_iteration": 2.376635789871216 }, { "auxiliary_loss_clip": 0.0109496, "auxiliary_loss_mlp": 0.01038014, "balance_loss_clip": 1.01931024, "balance_loss_mlp": 1.02649212, "epoch": 0.19522020141289645, "flos": 16909552880640.0, "grad_norm": 2.0241418147502372, "language_loss": 0.75633973, "learning_rate": 3.6358322127779476e-06, "loss": 0.77766943, "num_input_tokens_seen": 70076470, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.68359375, "step": 3247, "time_per_iteration": 2.3652443885803223 }, { "auxiliary_loss_clip": 0.01097248, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.01579106, "balance_loss_mlp": 1.02903318, "epoch": 0.19528032466556441, "flos": 26723603283840.0, "grad_norm": 1.8681365272471933, "language_loss": 0.75390351, "learning_rate": 3.6356148082352265e-06, "loss": 0.77523059, "num_input_tokens_seen": 70096220, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6796875, "step": 3248, "time_per_iteration": 2.444396495819092 }, { "auxiliary_loss_clip": 0.01093291, "auxiliary_loss_mlp": 0.01037908, "balance_loss_clip": 1.018417, "balance_loss_mlp": 1.0265578, "epoch": 0.19534044791823238, "flos": 21031748040960.0, "grad_norm": 2.045327479393052, "language_loss": 0.78500307, "learning_rate": 3.63539734532162e-06, "loss": 0.80631506, "num_input_tokens_seen": 70114800, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6640625, "step": 3249, "time_per_iteration": 2.3759329319000244 }, { "auxiliary_loss_clip": 0.0109475, "auxiliary_loss_mlp": 0.01036823, "balance_loss_clip": 1.01817822, "balance_loss_mlp": 1.02709186, "epoch": 0.19540057117090034, "flos": 22343016084480.0, "grad_norm": 1.5481371088008538, "language_loss": 0.72917652, "learning_rate": 3.6351798240448894e-06, "loss": 0.75049222, "num_input_tokens_seen": 70134930, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.67578125, "step": 3250, "time_per_iteration": 2.3821792602539062 }, { "auxiliary_loss_clip": 0.01091139, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.01860952, "balance_loss_mlp": 1.02606761, "epoch": 0.1954606944235683, "flos": 20300631926400.0, "grad_norm": 2.077856386507019, "language_loss": 0.79381561, "learning_rate": 3.634962244412797e-06, "loss": 0.8150928, "num_input_tokens_seen": 70152045, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.65234375, "step": 3251, "time_per_iteration": 2.3681230545043945 }, { "auxiliary_loss_clip": 0.01094269, "auxiliary_loss_mlp": 0.01043514, "balance_loss_clip": 1.02563214, "balance_loss_mlp": 1.02753401, "epoch": 0.19552081767623627, "flos": 17345932364160.0, "grad_norm": 4.307334667439784, "language_loss": 0.83700019, "learning_rate": 3.6347446064331074e-06, "loss": 0.85837805, "num_input_tokens_seen": 70169240, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.66796875, "step": 3252, "time_per_iteration": 2.3527212142944336 }, { "auxiliary_loss_clip": 0.01096735, "auxiliary_loss_mlp": 0.01046693, "balance_loss_clip": 1.02541399, "balance_loss_mlp": 1.02695727, "epoch": 0.19558094092890424, "flos": 31976286134400.0, "grad_norm": 1.8757880103444917, "language_loss": 0.73458648, "learning_rate": 3.6345269101135885e-06, "loss": 0.75602067, "num_input_tokens_seen": 70192690, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.69921875, "step": 3253, "time_per_iteration": 2.4764764308929443 }, { "auxiliary_loss_clip": 0.01094574, "auxiliary_loss_mlp": 0.01037477, "balance_loss_clip": 1.01700842, "balance_loss_mlp": 1.02562308, "epoch": 0.19564106418157223, "flos": 22267918016640.0, "grad_norm": 1.850358038220547, "language_loss": 0.76417327, "learning_rate": 3.634309155462008e-06, "loss": 0.78549385, "num_input_tokens_seen": 70209685, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.6875, "step": 3254, "time_per_iteration": 3.743779182434082 }, { "auxiliary_loss_clip": 0.01020729, "auxiliary_loss_mlp": 0.01004913, "balance_loss_clip": 1.00242162, "balance_loss_mlp": 1.00204873, "epoch": 0.1957011874342402, "flos": 54362000678400.0, "grad_norm": 0.7573270102413824, "language_loss": 0.55256647, "learning_rate": 3.6340913424861383e-06, "loss": 0.57282287, "num_input_tokens_seen": 70265050, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.18652344, "step": 3255, "time_per_iteration": 2.994016408920288 }, { "auxiliary_loss_clip": 0.01096999, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.01614356, "balance_loss_mlp": 1.02813172, "epoch": 0.19576131068690816, "flos": 16505817384960.0, "grad_norm": 2.7410325008516776, "language_loss": 0.70526785, "learning_rate": 3.6338734711937512e-06, "loss": 0.72660476, "num_input_tokens_seen": 70281830, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.6875, "step": 3256, "time_per_iteration": 3.76116943359375 }, { "auxiliary_loss_clip": 0.01091378, "auxiliary_loss_mlp": 0.01036129, "balance_loss_clip": 1.01678145, "balance_loss_mlp": 1.02586532, "epoch": 0.19582143393957613, "flos": 14718822888960.0, "grad_norm": 3.4934437635727464, "language_loss": 0.80128163, "learning_rate": 3.6336555415926232e-06, "loss": 0.82255673, "num_input_tokens_seen": 70297420, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.65625, "step": 3257, "time_per_iteration": 3.6858999729156494 }, { "auxiliary_loss_clip": 0.01093835, "auxiliary_loss_mlp": 0.01036658, "balance_loss_clip": 1.01727438, "balance_loss_mlp": 1.02637625, "epoch": 0.1958815571922441, "flos": 24424363186560.0, "grad_norm": 1.9926092582446306, "language_loss": 0.74545258, "learning_rate": 3.6334375536905313e-06, "loss": 0.76675749, "num_input_tokens_seen": 70319210, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.67578125, "step": 3258, "time_per_iteration": 2.413703203201294 }, { "auxiliary_loss_clip": 0.01095093, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.01610398, "balance_loss_mlp": 1.02671051, "epoch": 0.19594168044491206, "flos": 24899112120960.0, "grad_norm": 1.9569839831859468, "language_loss": 0.74006474, "learning_rate": 3.633219507495255e-06, "loss": 0.76137137, "num_input_tokens_seen": 70339045, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.68359375, "step": 3259, "time_per_iteration": 3.779968500137329 }, { "auxiliary_loss_clip": 0.01097543, "auxiliary_loss_mlp": 0.01043504, "balance_loss_clip": 1.0222733, "balance_loss_mlp": 1.02805924, "epoch": 0.19600180369758005, "flos": 12056206694400.0, "grad_norm": 2.537177333310713, "language_loss": 0.76501352, "learning_rate": 3.633001403014575e-06, "loss": 0.78642392, "num_input_tokens_seen": 70356505, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 0.6953125, "step": 3260, "time_per_iteration": 2.352241039276123 }, { "auxiliary_loss_clip": 0.01094873, "auxiliary_loss_mlp": 0.01039222, "balance_loss_clip": 1.01876605, "balance_loss_mlp": 1.02674258, "epoch": 0.19606192695024802, "flos": 20849152296960.0, "grad_norm": 2.0974309113083542, "language_loss": 0.82169342, "learning_rate": 3.632783240256276e-06, "loss": 0.84303439, "num_input_tokens_seen": 70375410, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.6796875, "step": 3261, "time_per_iteration": 2.386594295501709 }, { "auxiliary_loss_clip": 0.0109362, "auxiliary_loss_mlp": 0.01039481, "balance_loss_clip": 1.01932275, "balance_loss_mlp": 1.02704477, "epoch": 0.19612205020291598, "flos": 28474253187840.0, "grad_norm": 2.2523880615555, "language_loss": 0.76350236, "learning_rate": 3.632565019228143e-06, "loss": 0.78483337, "num_input_tokens_seen": 70396315, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.6640625, "step": 3262, "time_per_iteration": 2.4290218353271484 }, { "auxiliary_loss_clip": 0.01097967, "auxiliary_loss_mlp": 0.01042585, "balance_loss_clip": 1.02332032, "balance_loss_mlp": 1.02915668, "epoch": 0.19618217345558395, "flos": 25555444369920.0, "grad_norm": 1.6467314770588994, "language_loss": 0.86481088, "learning_rate": 3.6323467399379634e-06, "loss": 0.8862164, "num_input_tokens_seen": 70417945, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6875, "step": 3263, "time_per_iteration": 2.4395463466644287 }, { "auxiliary_loss_clip": 0.01092486, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.01756024, "balance_loss_mlp": 1.02606213, "epoch": 0.1962422967082519, "flos": 25263256268160.0, "grad_norm": 1.672456435112228, "language_loss": 0.73797274, "learning_rate": 3.6321284023935284e-06, "loss": 0.75925559, "num_input_tokens_seen": 70438690, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.6640625, "step": 3264, "time_per_iteration": 2.4126996994018555 }, { "auxiliary_loss_clip": 0.01093631, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.01721966, "balance_loss_mlp": 1.02846122, "epoch": 0.19630241996091988, "flos": 18806349202560.0, "grad_norm": 1.8398535179686513, "language_loss": 0.78879499, "learning_rate": 3.6319100066026284e-06, "loss": 0.81009054, "num_input_tokens_seen": 70455385, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65234375, "step": 3265, "time_per_iteration": 2.3726553916931152 }, { "auxiliary_loss_clip": 0.01020868, "auxiliary_loss_mlp": 0.01002098, "balance_loss_clip": 0.99966645, "balance_loss_mlp": 1.00293803, "epoch": 0.19636254321358784, "flos": 62318287526400.0, "grad_norm": 0.7818136536489693, "language_loss": 0.53380704, "learning_rate": 3.6316915525730586e-06, "loss": 0.55403674, "num_input_tokens_seen": 70514280, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.1796875, "step": 3266, "time_per_iteration": 3.045663833618164 }, { "auxiliary_loss_clip": 0.01097612, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.01870728, "balance_loss_mlp": 1.02730501, "epoch": 0.19642266646625584, "flos": 21068267189760.0, "grad_norm": 1.995501135005288, "language_loss": 0.80237895, "learning_rate": 3.631473040312614e-06, "loss": 0.82375205, "num_input_tokens_seen": 70531800, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.703125, "step": 3267, "time_per_iteration": 2.3809406757354736 }, { "auxiliary_loss_clip": 0.010922, "auxiliary_loss_mlp": 0.01036523, "balance_loss_clip": 1.01773524, "balance_loss_mlp": 1.02631319, "epoch": 0.1964827897189238, "flos": 14537763244800.0, "grad_norm": 9.820342443007977, "language_loss": 0.86726725, "learning_rate": 3.631254469829094e-06, "loss": 0.88855445, "num_input_tokens_seen": 70550615, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.66015625, "step": 3268, "time_per_iteration": 2.3497135639190674 }, { "auxiliary_loss_clip": 0.01094062, "auxiliary_loss_mlp": 0.01034729, "balance_loss_clip": 1.01679969, "balance_loss_mlp": 1.02803731, "epoch": 0.19654291297159177, "flos": 19243636381440.0, "grad_norm": 2.55200700436689, "language_loss": 0.69314349, "learning_rate": 3.631035841130297e-06, "loss": 0.71443152, "num_input_tokens_seen": 70568690, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.66015625, "step": 3269, "time_per_iteration": 2.382796049118042 }, { "auxiliary_loss_clip": 0.01098343, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.019189, "balance_loss_mlp": 1.02958822, "epoch": 0.19660303622425973, "flos": 25774524351360.0, "grad_norm": 2.125134277645011, "language_loss": 0.80778444, "learning_rate": 3.6308171542240273e-06, "loss": 0.82915473, "num_input_tokens_seen": 70588665, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6875, "step": 3270, "time_per_iteration": 2.418290376663208 }, { "auxiliary_loss_clip": 0.01090844, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.01775169, "balance_loss_mlp": 1.02518535, "epoch": 0.1966631594769277, "flos": 20594041925760.0, "grad_norm": 2.3224232715776933, "language_loss": 0.83714098, "learning_rate": 3.6305984091180875e-06, "loss": 0.8584125, "num_input_tokens_seen": 70606900, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.65625, "step": 3271, "time_per_iteration": 2.3715765476226807 }, { "auxiliary_loss_clip": 0.01090027, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.01740742, "balance_loss_mlp": 1.026191, "epoch": 0.19672328272959566, "flos": 23622059076480.0, "grad_norm": 1.9779680712166663, "language_loss": 0.8020243, "learning_rate": 3.630379605820286e-06, "loss": 0.82328403, "num_input_tokens_seen": 70625955, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.640625, "step": 3272, "time_per_iteration": 2.3996176719665527 }, { "auxiliary_loss_clip": 0.01094305, "auxiliary_loss_mlp": 0.0104103, "balance_loss_clip": 1.02173042, "balance_loss_mlp": 1.0272423, "epoch": 0.19678340598226365, "flos": 23109848386560.0, "grad_norm": 1.9417672352006365, "language_loss": 0.80638385, "learning_rate": 3.630160744338429e-06, "loss": 0.82773721, "num_input_tokens_seen": 70646090, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.671875, "step": 3273, "time_per_iteration": 2.4034245014190674 }, { "auxiliary_loss_clip": 0.01093358, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.02361071, "balance_loss_mlp": 1.02661026, "epoch": 0.19684352923493162, "flos": 24533711164800.0, "grad_norm": 1.685327645217058, "language_loss": 0.77463973, "learning_rate": 3.6299418246803287e-06, "loss": 0.79600132, "num_input_tokens_seen": 70666065, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.66796875, "step": 3274, "time_per_iteration": 2.4098117351531982 }, { "auxiliary_loss_clip": 0.01093006, "auxiliary_loss_mlp": 0.01038849, "balance_loss_clip": 1.01897693, "balance_loss_mlp": 1.02611113, "epoch": 0.19690365248759958, "flos": 21795438320640.0, "grad_norm": 3.613819678474458, "language_loss": 0.81330287, "learning_rate": 3.6297228468537976e-06, "loss": 0.83462143, "num_input_tokens_seen": 70681580, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.66796875, "step": 3275, "time_per_iteration": 2.3586437702178955 }, { "auxiliary_loss_clip": 0.01093968, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.01931393, "balance_loss_mlp": 1.02676809, "epoch": 0.19696377574026755, "flos": 19055803933440.0, "grad_norm": 1.8388194076600255, "language_loss": 0.81185746, "learning_rate": 3.6295038108666504e-06, "loss": 0.83318818, "num_input_tokens_seen": 70697745, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.671875, "step": 3276, "time_per_iteration": 2.3627398014068604 }, { "auxiliary_loss_clip": 0.01093263, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.01329339, "balance_loss_mlp": 1.02681422, "epoch": 0.19702389899293551, "flos": 22819545498240.0, "grad_norm": 3.913293022738735, "language_loss": 0.89339715, "learning_rate": 3.629284716726703e-06, "loss": 0.91465741, "num_input_tokens_seen": 70715110, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6640625, "step": 3277, "time_per_iteration": 2.3836357593536377 }, { "auxiliary_loss_clip": 0.01096947, "auxiliary_loss_mlp": 0.0104411, "balance_loss_clip": 1.02131701, "balance_loss_mlp": 1.02706742, "epoch": 0.19708402224560348, "flos": 22893107466240.0, "grad_norm": 2.121847641070351, "language_loss": 0.62500441, "learning_rate": 3.6290655644417757e-06, "loss": 0.646415, "num_input_tokens_seen": 70734715, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 0.69921875, "step": 3278, "time_per_iteration": 2.398437738418579 }, { "auxiliary_loss_clip": 0.01094952, "auxiliary_loss_mlp": 0.01041973, "balance_loss_clip": 1.02227986, "balance_loss_mlp": 1.02898657, "epoch": 0.19714414549827144, "flos": 25661440857600.0, "grad_norm": 2.9479425926662484, "language_loss": 0.73167086, "learning_rate": 3.6288463540196894e-06, "loss": 0.75304008, "num_input_tokens_seen": 70752650, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.66015625, "step": 3279, "time_per_iteration": 2.409766912460327 }, { "auxiliary_loss_clip": 0.01094849, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.01818752, "balance_loss_mlp": 1.02610898, "epoch": 0.19720426875093944, "flos": 23914666114560.0, "grad_norm": 1.6378735440973151, "language_loss": 0.8245886, "learning_rate": 3.6286270854682654e-06, "loss": 0.84590667, "num_input_tokens_seen": 70772365, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6875, "step": 3280, "time_per_iteration": 2.402646780014038 }, { "auxiliary_loss_clip": 0.01096637, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.02074862, "balance_loss_mlp": 1.02819932, "epoch": 0.1972643920036074, "flos": 13881081882240.0, "grad_norm": 1.9159719125239376, "language_loss": 0.77710402, "learning_rate": 3.6284077587953307e-06, "loss": 0.79846239, "num_input_tokens_seen": 70790340, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.68359375, "step": 3281, "time_per_iteration": 2.359015703201294 }, { "auxiliary_loss_clip": 0.01091432, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.02223015, "balance_loss_mlp": 1.02704263, "epoch": 0.19732451525627537, "flos": 19862611608960.0, "grad_norm": 1.7886278430035771, "language_loss": 0.79787135, "learning_rate": 3.628188374008712e-06, "loss": 0.81918859, "num_input_tokens_seen": 70809295, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.64453125, "step": 3282, "time_per_iteration": 2.4334022998809814 }, { "auxiliary_loss_clip": 0.01097752, "auxiliary_loss_mlp": 0.01036859, "balance_loss_clip": 1.01765406, "balance_loss_mlp": 1.02917266, "epoch": 0.19738463850894333, "flos": 24972255152640.0, "grad_norm": 2.004615061425067, "language_loss": 0.71514744, "learning_rate": 3.6279689311162382e-06, "loss": 0.73649353, "num_input_tokens_seen": 70828765, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.68359375, "step": 3283, "time_per_iteration": 2.411259412765503 }, { "auxiliary_loss_clip": 0.01093562, "auxiliary_loss_mlp": 0.010486, "balance_loss_clip": 1.02915692, "balance_loss_mlp": 1.02674568, "epoch": 0.1974447617616113, "flos": 18367909948800.0, "grad_norm": 2.0901111004947532, "language_loss": 0.78843147, "learning_rate": 3.6277494301257407e-06, "loss": 0.80985308, "num_input_tokens_seen": 70846805, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.66796875, "step": 3284, "time_per_iteration": 2.363900661468506 }, { "auxiliary_loss_clip": 0.01095873, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.01948333, "balance_loss_mlp": 1.02663827, "epoch": 0.19750488501427926, "flos": 22891850657280.0, "grad_norm": 2.1448640694948136, "language_loss": 0.86014587, "learning_rate": 3.6275298710450533e-06, "loss": 0.88151383, "num_input_tokens_seen": 70863805, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.69140625, "step": 3285, "time_per_iteration": 2.3824238777160645 }, { "auxiliary_loss_clip": 0.01092925, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 1.01686954, "balance_loss_mlp": 1.0281477, "epoch": 0.19756500826694723, "flos": 21870431654400.0, "grad_norm": 2.307979665706718, "language_loss": 0.88518846, "learning_rate": 3.627310253882012e-06, "loss": 0.90646577, "num_input_tokens_seen": 70882660, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6484375, "step": 3286, "time_per_iteration": 2.3906936645507812 }, { "auxiliary_loss_clip": 0.01095962, "auxiliary_loss_mlp": 0.01041885, "balance_loss_clip": 1.02093983, "balance_loss_mlp": 1.02850592, "epoch": 0.19762513151961522, "flos": 15158065104000.0, "grad_norm": 2.3708893620062006, "language_loss": 0.78196716, "learning_rate": 3.627090578644452e-06, "loss": 0.80334568, "num_input_tokens_seen": 70898765, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.671875, "step": 3287, "time_per_iteration": 2.351134777069092 }, { "auxiliary_loss_clip": 0.01095035, "auxiliary_loss_mlp": 0.0103711, "balance_loss_clip": 1.01710689, "balance_loss_mlp": 1.02704477, "epoch": 0.1976852547722832, "flos": 16978331992320.0, "grad_norm": 6.362389513168747, "language_loss": 0.81321955, "learning_rate": 3.6268708453402163e-06, "loss": 0.83454096, "num_input_tokens_seen": 70916370, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6796875, "step": 3288, "time_per_iteration": 2.357706308364868 }, { "auxiliary_loss_clip": 0.01091467, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.02087522, "balance_loss_mlp": 1.02781177, "epoch": 0.19774537802495115, "flos": 20301888735360.0, "grad_norm": 1.8304637570278746, "language_loss": 0.72829801, "learning_rate": 3.626651053977144e-06, "loss": 0.74960506, "num_input_tokens_seen": 70934870, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.63671875, "step": 3289, "time_per_iteration": 2.3803505897521973 }, { "auxiliary_loss_clip": 0.01093329, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.02309, "balance_loss_mlp": 1.02678442, "epoch": 0.19780550127761912, "flos": 27234242962560.0, "grad_norm": 2.524989848506488, "language_loss": 0.7940982, "learning_rate": 3.6264312045630802e-06, "loss": 0.81544745, "num_input_tokens_seen": 70955140, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6640625, "step": 3290, "time_per_iteration": 2.4267165660858154 }, { "auxiliary_loss_clip": 0.01023181, "auxiliary_loss_mlp": 0.01002361, "balance_loss_clip": 0.9999764, "balance_loss_mlp": 1.00474858, "epoch": 0.19786562453028708, "flos": 63547368629760.0, "grad_norm": 0.8859536292225495, "language_loss": 0.60287488, "learning_rate": 3.62621129710587e-06, "loss": 0.6231302, "num_input_tokens_seen": 71012005, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.18457031, "step": 3291, "time_per_iteration": 3.0441205501556396 }, { "auxiliary_loss_clip": 0.01095104, "auxiliary_loss_mlp": 0.01036338, "balance_loss_clip": 1.01521397, "balance_loss_mlp": 1.02619529, "epoch": 0.19792574778295505, "flos": 26285443320960.0, "grad_norm": 1.710016469752574, "language_loss": 0.81068504, "learning_rate": 3.6259913316133625e-06, "loss": 0.83199942, "num_input_tokens_seen": 71031140, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.6875, "step": 3292, "time_per_iteration": 2.4184789657592773 }, { "auxiliary_loss_clip": 0.01088314, "auxiliary_loss_mlp": 0.01036573, "balance_loss_clip": 1.01852489, "balance_loss_mlp": 1.02581763, "epoch": 0.19798587103562304, "flos": 19937081272320.0, "grad_norm": 2.164269928396593, "language_loss": 0.81474257, "learning_rate": 3.625771308093406e-06, "loss": 0.8359915, "num_input_tokens_seen": 71050250, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 3293, "time_per_iteration": 3.7663729190826416 }, { "auxiliary_loss_clip": 0.011012, "auxiliary_loss_mlp": 0.01041589, "balance_loss_clip": 1.02060866, "balance_loss_mlp": 1.03096867, "epoch": 0.198045994288291, "flos": 20119258080000.0, "grad_norm": 1.8060343049106946, "language_loss": 0.61091065, "learning_rate": 3.625551226553854e-06, "loss": 0.63233852, "num_input_tokens_seen": 71068665, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.703125, "step": 3294, "time_per_iteration": 2.3722171783447266 }, { "auxiliary_loss_clip": 0.01092659, "auxiliary_loss_mlp": 0.01038661, "balance_loss_clip": 1.01956379, "balance_loss_mlp": 1.02670622, "epoch": 0.19810611754095897, "flos": 17966688071040.0, "grad_norm": 1.9379544072355641, "language_loss": 0.87146139, "learning_rate": 3.6253310870025598e-06, "loss": 0.89277458, "num_input_tokens_seen": 71085320, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.66015625, "step": 3295, "time_per_iteration": 2.3429510593414307 }, { "auxiliary_loss_clip": 0.01092325, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.01948702, "balance_loss_mlp": 1.02713811, "epoch": 0.19816624079362694, "flos": 15084119111040.0, "grad_norm": 2.476934992282589, "language_loss": 0.80659974, "learning_rate": 3.6251108894473806e-06, "loss": 0.82789588, "num_input_tokens_seen": 71102020, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.65234375, "step": 3296, "time_per_iteration": 5.0992112159729 }, { "auxiliary_loss_clip": 0.01095761, "auxiliary_loss_mlp": 0.01034934, "balance_loss_clip": 1.01356006, "balance_loss_mlp": 1.0261941, "epoch": 0.1982263640462949, "flos": 24899147032320.0, "grad_norm": 1.9064652416671366, "language_loss": 0.68187982, "learning_rate": 3.624890633896173e-06, "loss": 0.70318681, "num_input_tokens_seen": 71123390, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.6953125, "step": 3297, "time_per_iteration": 2.414424180984497 }, { "auxiliary_loss_clip": 0.01090671, "auxiliary_loss_mlp": 0.01034454, "balance_loss_clip": 1.01708508, "balance_loss_mlp": 1.02733803, "epoch": 0.19828648729896287, "flos": 20375136501120.0, "grad_norm": 1.7080929907446294, "language_loss": 0.81423819, "learning_rate": 3.6246703203567996e-06, "loss": 0.83548945, "num_input_tokens_seen": 71141800, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.6328125, "step": 3298, "time_per_iteration": 2.3621015548706055 }, { "auxiliary_loss_clip": 0.01096755, "auxiliary_loss_mlp": 0.01040343, "balance_loss_clip": 1.01938558, "balance_loss_mlp": 1.02652717, "epoch": 0.19834661055163083, "flos": 18879038386560.0, "grad_norm": 1.8118896067170283, "language_loss": 0.8499251, "learning_rate": 3.624449948837121e-06, "loss": 0.87129605, "num_input_tokens_seen": 71159505, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.703125, "step": 3299, "time_per_iteration": 3.7855286598205566 }, { "auxiliary_loss_clip": 0.0102453, "auxiliary_loss_mlp": 0.01005913, "balance_loss_clip": 1.00327826, "balance_loss_mlp": 1.00592458, "epoch": 0.19840673380429882, "flos": 60255897292800.0, "grad_norm": 0.7660662056871961, "language_loss": 0.53273189, "learning_rate": 3.6242295193450024e-06, "loss": 0.55303633, "num_input_tokens_seen": 71223265, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.18554688, "step": 3300, "time_per_iteration": 2.9814531803131104 }, { "auxiliary_loss_clip": 0.01093189, "auxiliary_loss_mlp": 0.01043868, "balance_loss_clip": 1.02426958, "balance_loss_mlp": 1.02672458, "epoch": 0.1984668570569668, "flos": 19900981059840.0, "grad_norm": 2.006506243334892, "language_loss": 0.72906816, "learning_rate": 3.6240090318883103e-06, "loss": 0.75043869, "num_input_tokens_seen": 71242385, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6640625, "step": 3301, "time_per_iteration": 2.390085458755493 }, { "auxiliary_loss_clip": 0.01093821, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.01660538, "balance_loss_mlp": 1.0276103, "epoch": 0.19852698030963475, "flos": 15629916395520.0, "grad_norm": 2.3942277196780233, "language_loss": 0.88054079, "learning_rate": 3.623788486474913e-06, "loss": 0.90182722, "num_input_tokens_seen": 71258990, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.66015625, "step": 3302, "time_per_iteration": 2.3620595932006836 }, { "auxiliary_loss_clip": 0.01093732, "auxiliary_loss_mlp": 0.01034571, "balance_loss_clip": 1.01491392, "balance_loss_mlp": 1.02640676, "epoch": 0.19858710356230272, "flos": 43141335575040.0, "grad_norm": 1.8277191047193035, "language_loss": 0.73320621, "learning_rate": 3.623567883112682e-06, "loss": 0.7544893, "num_input_tokens_seen": 71282770, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.671875, "step": 3303, "time_per_iteration": 2.573152780532837 }, { "auxiliary_loss_clip": 0.01092105, "auxiliary_loss_mlp": 0.01041146, "balance_loss_clip": 1.02151227, "balance_loss_mlp": 1.02686501, "epoch": 0.19864722681497068, "flos": 35142873937920.0, "grad_norm": 1.8424346043034194, "language_loss": 0.74658036, "learning_rate": 3.6233472218094897e-06, "loss": 0.76791286, "num_input_tokens_seen": 71301410, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.65234375, "step": 3304, "time_per_iteration": 2.49125599861145 }, { "auxiliary_loss_clip": 0.01091011, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.01579833, "balance_loss_mlp": 1.02630901, "epoch": 0.19870735006763865, "flos": 19425219696000.0, "grad_norm": 2.7354527523926517, "language_loss": 0.85860914, "learning_rate": 3.62312650257321e-06, "loss": 0.8798508, "num_input_tokens_seen": 71319670, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.6484375, "step": 3305, "time_per_iteration": 2.3646278381347656 }, { "auxiliary_loss_clip": 0.01093773, "auxiliary_loss_mlp": 0.01033682, "balance_loss_clip": 1.01421475, "balance_loss_mlp": 1.02594137, "epoch": 0.19876747332030664, "flos": 23546332604160.0, "grad_norm": 1.5847495036578148, "language_loss": 0.68523008, "learning_rate": 3.622905725411721e-06, "loss": 0.70650458, "num_input_tokens_seen": 71339850, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6796875, "step": 3306, "time_per_iteration": 2.4286417961120605 }, { "auxiliary_loss_clip": 0.01089723, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.01242733, "balance_loss_mlp": 1.02498138, "epoch": 0.1988275965729746, "flos": 19828361698560.0, "grad_norm": 1.9311524076025604, "language_loss": 0.76461613, "learning_rate": 3.622684890332901e-06, "loss": 0.78582305, "num_input_tokens_seen": 71359795, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6484375, "step": 3307, "time_per_iteration": 2.4046173095703125 }, { "auxiliary_loss_clip": 0.01092585, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.01755738, "balance_loss_mlp": 1.02870154, "epoch": 0.19888771982564257, "flos": 23512501630080.0, "grad_norm": 2.169181339009595, "language_loss": 0.7575652, "learning_rate": 3.622463997344632e-06, "loss": 0.77883744, "num_input_tokens_seen": 71378885, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.63671875, "step": 3308, "time_per_iteration": 2.406249761581421 }, { "auxiliary_loss_clip": 0.01092841, "auxiliary_loss_mlp": 0.01034602, "balance_loss_clip": 1.01551628, "balance_loss_mlp": 1.02665699, "epoch": 0.19894784307831054, "flos": 18149528194560.0, "grad_norm": 3.2640222401270114, "language_loss": 0.75881577, "learning_rate": 3.622243046454796e-06, "loss": 0.78009021, "num_input_tokens_seen": 71397285, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.66015625, "step": 3309, "time_per_iteration": 2.353468894958496 }, { "auxiliary_loss_clip": 0.01090891, "auxiliary_loss_mlp": 0.01042274, "balance_loss_clip": 1.02351093, "balance_loss_mlp": 1.02639914, "epoch": 0.1990079663309785, "flos": 24275004923520.0, "grad_norm": 2.1937839839446713, "language_loss": 0.87566149, "learning_rate": 3.6220220376712787e-06, "loss": 0.89699316, "num_input_tokens_seen": 71415775, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.64453125, "step": 3310, "time_per_iteration": 2.4153454303741455 }, { "auxiliary_loss_clip": 0.01090099, "auxiliary_loss_mlp": 0.01032432, "balance_loss_clip": 1.01319122, "balance_loss_mlp": 1.02569306, "epoch": 0.19906808958364647, "flos": 34896212115840.0, "grad_norm": 2.7435653817024566, "language_loss": 0.64026791, "learning_rate": 3.621800971001967e-06, "loss": 0.66149318, "num_input_tokens_seen": 71437315, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.64453125, "step": 3311, "time_per_iteration": 2.482752561569214 }, { "auxiliary_loss_clip": 0.01094648, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.01598966, "balance_loss_mlp": 1.02681947, "epoch": 0.19912821283631443, "flos": 24023734801920.0, "grad_norm": 2.2046276982202686, "language_loss": 0.73585874, "learning_rate": 3.6215798464547505e-06, "loss": 0.75714952, "num_input_tokens_seen": 71456320, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.67578125, "step": 3312, "time_per_iteration": 2.39709734916687 }, { "auxiliary_loss_clip": 0.01089779, "auxiliary_loss_mlp": 0.01037231, "balance_loss_clip": 1.01886082, "balance_loss_mlp": 1.0253818, "epoch": 0.19918833608898243, "flos": 19858177866240.0, "grad_norm": 2.214729870251275, "language_loss": 0.83765405, "learning_rate": 3.6213586640375207e-06, "loss": 0.85892415, "num_input_tokens_seen": 71475360, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.64453125, "step": 3313, "time_per_iteration": 2.3777294158935547 }, { "auxiliary_loss_clip": 0.01094265, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.02130866, "balance_loss_mlp": 1.02869534, "epoch": 0.1992484593416504, "flos": 29094520135680.0, "grad_norm": 4.863015940835911, "language_loss": 0.80661523, "learning_rate": 3.6211374237581706e-06, "loss": 0.82795733, "num_input_tokens_seen": 71496155, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.65625, "step": 3314, "time_per_iteration": 2.4357516765594482 }, { "auxiliary_loss_clip": 0.01088398, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.01609409, "balance_loss_mlp": 1.02526283, "epoch": 0.19930858259431836, "flos": 23293875496320.0, "grad_norm": 1.444588805498063, "language_loss": 0.8718859, "learning_rate": 3.620916125624596e-06, "loss": 0.89310575, "num_input_tokens_seen": 71517295, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.6328125, "step": 3315, "time_per_iteration": 2.4119343757629395 }, { "auxiliary_loss_clip": 0.01093967, "auxiliary_loss_mlp": 0.01034429, "balance_loss_clip": 1.0160712, "balance_loss_mlp": 1.02733731, "epoch": 0.19936870584698632, "flos": 25377526748160.0, "grad_norm": 1.5396054078786452, "language_loss": 0.7089622, "learning_rate": 3.620694769644694e-06, "loss": 0.73024619, "num_input_tokens_seen": 71540000, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.66796875, "step": 3316, "time_per_iteration": 2.441154956817627 }, { "auxiliary_loss_clip": 0.01023534, "auxiliary_loss_mlp": 0.01004258, "balance_loss_clip": 1.00170732, "balance_loss_mlp": 1.00516033, "epoch": 0.1994288290996543, "flos": 62164388920320.0, "grad_norm": 0.8381400898957825, "language_loss": 0.66274536, "learning_rate": 3.6204733558263653e-06, "loss": 0.68302333, "num_input_tokens_seen": 71607880, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.18359375, "step": 3317, "time_per_iteration": 3.0949625968933105 }, { "auxiliary_loss_clip": 0.01095287, "auxiliary_loss_mlp": 0.01039173, "balance_loss_clip": 1.02044487, "balance_loss_mlp": 1.02687836, "epoch": 0.19948895235232225, "flos": 19024835690880.0, "grad_norm": 3.3600161071849532, "language_loss": 0.74098063, "learning_rate": 3.6202518841775104e-06, "loss": 0.76232529, "num_input_tokens_seen": 71625695, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.68359375, "step": 3318, "time_per_iteration": 2.3706955909729004 }, { "auxiliary_loss_clip": 0.01088429, "auxiliary_loss_mlp": 0.01038061, "balance_loss_clip": 1.02025127, "balance_loss_mlp": 1.02606761, "epoch": 0.19954907560499022, "flos": 37814287795200.0, "grad_norm": 1.9049686259150524, "language_loss": 0.78945422, "learning_rate": 3.6200303547060336e-06, "loss": 0.81071913, "num_input_tokens_seen": 71648520, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.625, "step": 3319, "time_per_iteration": 2.5089056491851807 }, { "auxiliary_loss_clip": 0.01091702, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.01383972, "balance_loss_mlp": 1.02566719, "epoch": 0.1996091988576582, "flos": 49563329414400.0, "grad_norm": 2.4920940963466145, "language_loss": 0.76352167, "learning_rate": 3.61980876741984e-06, "loss": 0.78478074, "num_input_tokens_seen": 71672185, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.66015625, "step": 3320, "time_per_iteration": 2.6354074478149414 }, { "auxiliary_loss_clip": 0.01090883, "auxiliary_loss_mlp": 0.01039099, "balance_loss_clip": 1.02159953, "balance_loss_mlp": 1.02575707, "epoch": 0.19966932211032618, "flos": 22634750338560.0, "grad_norm": 2.207245808725498, "language_loss": 0.80172241, "learning_rate": 3.6195871223268392e-06, "loss": 0.82302225, "num_input_tokens_seen": 71692890, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.65234375, "step": 3321, "time_per_iteration": 2.3941807746887207 }, { "auxiliary_loss_clip": 0.0102174, "auxiliary_loss_mlp": 0.01001946, "balance_loss_clip": 0.99951375, "balance_loss_mlp": 1.00350666, "epoch": 0.19972944536299414, "flos": 54079308466560.0, "grad_norm": 0.8696088293687311, "language_loss": 0.65128511, "learning_rate": 3.61936541943494e-06, "loss": 0.67152202, "num_input_tokens_seen": 71745815, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.18261719, "step": 3322, "time_per_iteration": 2.8282928466796875 }, { "auxiliary_loss_clip": 0.01021052, "auxiliary_loss_mlp": 0.01002368, "balance_loss_clip": 0.99996036, "balance_loss_mlp": 1.00306439, "epoch": 0.1997895686156621, "flos": 69352204498560.0, "grad_norm": 0.7836553028040963, "language_loss": 0.56965047, "learning_rate": 3.619143658752054e-06, "loss": 0.5898847, "num_input_tokens_seen": 71806915, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.1796875, "step": 3323, "time_per_iteration": 3.154853105545044 }, { "auxiliary_loss_clip": 0.01092334, "auxiliary_loss_mlp": 0.01036843, "balance_loss_clip": 1.01741147, "balance_loss_mlp": 1.02751148, "epoch": 0.19984969186833007, "flos": 18551064274560.0, "grad_norm": 2.64619446476262, "language_loss": 0.80301172, "learning_rate": 3.6189218402860958e-06, "loss": 0.82430351, "num_input_tokens_seen": 71824645, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6484375, "step": 3324, "time_per_iteration": 2.364062786102295 }, { "auxiliary_loss_clip": 0.01089631, "auxiliary_loss_mlp": 0.0104084, "balance_loss_clip": 1.02077699, "balance_loss_mlp": 1.02509081, "epoch": 0.19990981512099804, "flos": 26428552450560.0, "grad_norm": 1.755109624568066, "language_loss": 0.54017216, "learning_rate": 3.6186999640449817e-06, "loss": 0.56147689, "num_input_tokens_seen": 71845125, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.64453125, "step": 3325, "time_per_iteration": 2.447310447692871 }, { "auxiliary_loss_clip": 0.01091465, "auxiliary_loss_mlp": 0.01034303, "balance_loss_clip": 1.01624262, "balance_loss_mlp": 1.02723396, "epoch": 0.19996993837366603, "flos": 16325071943040.0, "grad_norm": 2.354431826748387, "language_loss": 0.85965687, "learning_rate": 3.6184780300366294e-06, "loss": 0.88091457, "num_input_tokens_seen": 71863500, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.64453125, "step": 3326, "time_per_iteration": 2.361738443374634 }, { "auxiliary_loss_clip": 0.01090149, "auxiliary_loss_mlp": 0.01036799, "balance_loss_clip": 1.01886964, "balance_loss_mlp": 1.02702761, "epoch": 0.200030061626334, "flos": 20843287188480.0, "grad_norm": 1.8797726577910963, "language_loss": 0.71729505, "learning_rate": 3.6182560382689598e-06, "loss": 0.73856449, "num_input_tokens_seen": 71881845, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6328125, "step": 3327, "time_per_iteration": 2.370218515396118 }, { "auxiliary_loss_clip": 0.01093333, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.0185287, "balance_loss_mlp": 1.02680278, "epoch": 0.20009018487900196, "flos": 23761677070080.0, "grad_norm": 1.981411592710895, "language_loss": 0.76705289, "learning_rate": 3.6180339887498948e-06, "loss": 0.78836781, "num_input_tokens_seen": 71900940, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6640625, "step": 3328, "time_per_iteration": 2.3940131664276123 }, { "auxiliary_loss_clip": 0.01087467, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 1.01604056, "balance_loss_mlp": 1.02578974, "epoch": 0.20015030813166992, "flos": 28110283597440.0, "grad_norm": 1.7915055564184035, "language_loss": 0.6896466, "learning_rate": 3.6178118814873587e-06, "loss": 0.71084785, "num_input_tokens_seen": 71921925, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.6171875, "step": 3329, "time_per_iteration": 2.536358594894409 }, { "auxiliary_loss_clip": 0.010967, "auxiliary_loss_mlp": 0.01038771, "balance_loss_clip": 1.01779008, "balance_loss_mlp": 1.02862477, "epoch": 0.2002104313843379, "flos": 26065979314560.0, "grad_norm": 1.5752217212677344, "language_loss": 0.81327724, "learning_rate": 3.6175897164892783e-06, "loss": 0.83463192, "num_input_tokens_seen": 71941855, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.6796875, "step": 3330, "time_per_iteration": 2.4372637271881104 }, { "auxiliary_loss_clip": 0.01092833, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.01161528, "balance_loss_mlp": 1.02617478, "epoch": 0.20027055463700585, "flos": 22965517359360.0, "grad_norm": 4.3997616609776005, "language_loss": 0.76216048, "learning_rate": 3.617367493763581e-06, "loss": 0.78339738, "num_input_tokens_seen": 71960915, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.66796875, "step": 3331, "time_per_iteration": 2.3871850967407227 }, { "auxiliary_loss_clip": 0.01093251, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.0170908, "balance_loss_mlp": 1.026245, "epoch": 0.20033067788967382, "flos": 17164698163200.0, "grad_norm": 1.8408896854764203, "language_loss": 0.79128915, "learning_rate": 3.6171452133181994e-06, "loss": 0.81259203, "num_input_tokens_seen": 71979220, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.671875, "step": 3332, "time_per_iteration": 3.806399345397949 }, { "auxiliary_loss_clip": 0.01021104, "auxiliary_loss_mlp": 0.01010393, "balance_loss_clip": 1.00805676, "balance_loss_mlp": 1.00349379, "epoch": 0.2003908011423418, "flos": 60822747993600.0, "grad_norm": 0.935613534119169, "language_loss": 0.61915773, "learning_rate": 3.6169228751610643e-06, "loss": 0.63947272, "num_input_tokens_seen": 72033950, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.17578125, "step": 3333, "time_per_iteration": 2.9001359939575195 }, { "auxiliary_loss_clip": 0.01091975, "auxiliary_loss_mlp": 0.01042819, "balance_loss_clip": 1.02338839, "balance_loss_mlp": 1.02532458, "epoch": 0.20045092439500978, "flos": 24205108648320.0, "grad_norm": 2.1869137175770432, "language_loss": 0.80889475, "learning_rate": 3.6167004793001107e-06, "loss": 0.83024263, "num_input_tokens_seen": 72051395, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6640625, "step": 3334, "time_per_iteration": 2.3922924995422363 }, { "auxiliary_loss_clip": 0.01098087, "auxiliary_loss_mlp": 0.01040421, "balance_loss_clip": 1.01921344, "balance_loss_mlp": 1.02878976, "epoch": 0.20051104764767774, "flos": 29386324212480.0, "grad_norm": 1.845156825894402, "language_loss": 0.73663443, "learning_rate": 3.616478025743276e-06, "loss": 0.75801957, "num_input_tokens_seen": 72071305, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.6953125, "step": 3335, "time_per_iteration": 3.7880725860595703 }, { "auxiliary_loss_clip": 0.01098442, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.02708125, "balance_loss_mlp": 1.03009892, "epoch": 0.2005711709003457, "flos": 23512676186880.0, "grad_norm": 2.0307575494379106, "language_loss": 0.80261171, "learning_rate": 3.6162555144984986e-06, "loss": 0.82406187, "num_input_tokens_seen": 72090165, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.68359375, "step": 3336, "time_per_iteration": 3.768510341644287 }, { "auxiliary_loss_clip": 0.0109644, "auxiliary_loss_mlp": 0.01039487, "balance_loss_clip": 1.01849413, "balance_loss_mlp": 1.02602303, "epoch": 0.20063129415301367, "flos": 22522434894720.0, "grad_norm": 2.4752090450551316, "language_loss": 0.77787721, "learning_rate": 3.6160329455737193e-06, "loss": 0.79923654, "num_input_tokens_seen": 72107210, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.703125, "step": 3337, "time_per_iteration": 2.377169132232666 }, { "auxiliary_loss_clip": 0.01097078, "auxiliary_loss_mlp": 0.01045507, "balance_loss_clip": 1.02416873, "balance_loss_mlp": 1.02941728, "epoch": 0.20069141740568164, "flos": 25957050272640.0, "grad_norm": 1.908908843564825, "language_loss": 0.68563581, "learning_rate": 3.6158103189768815e-06, "loss": 0.70706165, "num_input_tokens_seen": 72126315, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.67578125, "step": 3338, "time_per_iteration": 2.4194326400756836 }, { "auxiliary_loss_clip": 0.01093844, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.0220685, "balance_loss_mlp": 1.02799261, "epoch": 0.2007515406583496, "flos": 24789449940480.0, "grad_norm": 1.9056828358542846, "language_loss": 0.68690825, "learning_rate": 3.6155876347159296e-06, "loss": 0.70825994, "num_input_tokens_seen": 72146470, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.66015625, "step": 3339, "time_per_iteration": 3.78161883354187 }, { "auxiliary_loss_clip": 0.01098606, "auxiliary_loss_mlp": 0.01035575, "balance_loss_clip": 1.01578593, "balance_loss_mlp": 1.02914751, "epoch": 0.2008116639110176, "flos": 37924054709760.0, "grad_norm": 2.1736325201628337, "language_loss": 0.66454792, "learning_rate": 3.6153648927988104e-06, "loss": 0.68588972, "num_input_tokens_seen": 72166600, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.6953125, "step": 3340, "time_per_iteration": 2.5375027656555176 }, { "auxiliary_loss_clip": 0.01097662, "auxiliary_loss_mlp": 0.01037098, "balance_loss_clip": 1.01603413, "balance_loss_mlp": 1.0282203, "epoch": 0.20087178716368556, "flos": 20739490116480.0, "grad_norm": 2.124235432886837, "language_loss": 0.73798639, "learning_rate": 3.6151420932334737e-06, "loss": 0.75933397, "num_input_tokens_seen": 72185160, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.6953125, "step": 3341, "time_per_iteration": 2.3896074295043945 }, { "auxiliary_loss_clip": 0.01092513, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.01921797, "balance_loss_mlp": 1.02799368, "epoch": 0.20093191041635353, "flos": 23841139057920.0, "grad_norm": 2.0304712392790076, "language_loss": 0.71718943, "learning_rate": 3.6149192360278706e-06, "loss": 0.73849332, "num_input_tokens_seen": 72205160, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.64453125, "step": 3342, "time_per_iteration": 2.3963634967803955 }, { "auxiliary_loss_clip": 0.01094734, "auxiliary_loss_mlp": 0.01041169, "balance_loss_clip": 1.02151191, "balance_loss_mlp": 1.02824581, "epoch": 0.2009920336690215, "flos": 21791179134720.0, "grad_norm": 2.003400364739572, "language_loss": 0.72176361, "learning_rate": 3.614696321189954e-06, "loss": 0.74312264, "num_input_tokens_seen": 72223555, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6640625, "step": 3343, "time_per_iteration": 2.389746904373169 }, { "auxiliary_loss_clip": 0.01096642, "auxiliary_loss_mlp": 0.01040227, "balance_loss_clip": 1.01825643, "balance_loss_mlp": 1.02871799, "epoch": 0.20105215692168946, "flos": 26358027770880.0, "grad_norm": 2.104630117063403, "language_loss": 0.80623066, "learning_rate": 3.614473348727679e-06, "loss": 0.82759929, "num_input_tokens_seen": 72242465, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 0.6796875, "step": 3344, "time_per_iteration": 2.4078948497772217 }, { "auxiliary_loss_clip": 0.01094111, "auxiliary_loss_mlp": 0.01037183, "balance_loss_clip": 1.01802564, "balance_loss_mlp": 1.0274086, "epoch": 0.20111228017435742, "flos": 18806279379840.0, "grad_norm": 1.9719152022919537, "language_loss": 0.83162439, "learning_rate": 3.614250318649003e-06, "loss": 0.85293734, "num_input_tokens_seen": 72260655, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.66796875, "step": 3345, "time_per_iteration": 2.371562957763672 }, { "auxiliary_loss_clip": 0.01089602, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.01720166, "balance_loss_mlp": 1.02813196, "epoch": 0.20117240342702541, "flos": 19974019357440.0, "grad_norm": 3.4927529554161354, "language_loss": 0.67638123, "learning_rate": 3.614027230961885e-06, "loss": 0.69762951, "num_input_tokens_seen": 72279055, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.61328125, "step": 3346, "time_per_iteration": 2.3736603260040283 }, { "auxiliary_loss_clip": 0.01094478, "auxiliary_loss_mlp": 0.01044854, "balance_loss_clip": 1.02589989, "balance_loss_mlp": 1.02834606, "epoch": 0.20123252667969338, "flos": 23141759235840.0, "grad_norm": 2.1909215971763474, "language_loss": 0.7360484, "learning_rate": 3.613804085674288e-06, "loss": 0.75744176, "num_input_tokens_seen": 72297895, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6640625, "step": 3347, "time_per_iteration": 2.379194974899292 }, { "auxiliary_loss_clip": 0.01094469, "auxiliary_loss_mlp": 0.01041893, "balance_loss_clip": 1.02266407, "balance_loss_mlp": 1.02856326, "epoch": 0.20129264993236134, "flos": 23220557907840.0, "grad_norm": 1.6814276601146283, "language_loss": 0.86456525, "learning_rate": 3.6135808827941733e-06, "loss": 0.88592887, "num_input_tokens_seen": 72318385, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.65625, "step": 3348, "time_per_iteration": 2.419203519821167 }, { "auxiliary_loss_clip": 0.01089155, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.01655841, "balance_loss_mlp": 1.02553391, "epoch": 0.2013527731850293, "flos": 21870396743040.0, "grad_norm": 1.560812432538016, "language_loss": 0.70809293, "learning_rate": 3.6133576223295083e-06, "loss": 0.72935629, "num_input_tokens_seen": 72338235, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.63671875, "step": 3349, "time_per_iteration": 2.3877413272857666 }, { "auxiliary_loss_clip": 0.01095388, "auxiliary_loss_mlp": 0.01038328, "balance_loss_clip": 1.01774049, "balance_loss_mlp": 1.02822638, "epoch": 0.20141289643769728, "flos": 18039831102720.0, "grad_norm": 2.5211327439086255, "language_loss": 0.71299899, "learning_rate": 3.61313430428826e-06, "loss": 0.73433614, "num_input_tokens_seen": 72357825, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.671875, "step": 3350, "time_per_iteration": 2.377131700515747 }, { "auxiliary_loss_clip": 0.01095739, "auxiliary_loss_mlp": 0.01040096, "balance_loss_clip": 1.01918685, "balance_loss_mlp": 1.02833414, "epoch": 0.20147301969036524, "flos": 23950277568000.0, "grad_norm": 2.330382344330002, "language_loss": 0.76556957, "learning_rate": 3.612910928678397e-06, "loss": 0.78692788, "num_input_tokens_seen": 72376335, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.671875, "step": 3351, "time_per_iteration": 2.3786630630493164 }, { "auxiliary_loss_clip": 0.01095691, "auxiliary_loss_mlp": 0.01040055, "balance_loss_clip": 1.01907384, "balance_loss_mlp": 1.02656364, "epoch": 0.2015331429430332, "flos": 25587425041920.0, "grad_norm": 2.4628884995422418, "language_loss": 0.80423838, "learning_rate": 3.6126874955078926e-06, "loss": 0.82559586, "num_input_tokens_seen": 72395440, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.69140625, "step": 3352, "time_per_iteration": 2.4034831523895264 }, { "auxiliary_loss_clip": 0.01096008, "auxiliary_loss_mlp": 0.01035413, "balance_loss_clip": 1.01550508, "balance_loss_mlp": 1.02854121, "epoch": 0.2015932661957012, "flos": 26723742929280.0, "grad_norm": 2.4913601598434134, "language_loss": 0.80097902, "learning_rate": 3.6124640047847193e-06, "loss": 0.82229328, "num_input_tokens_seen": 72414670, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.67578125, "step": 3353, "time_per_iteration": 2.4384377002716064 }, { "auxiliary_loss_clip": 0.01088909, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.0159328, "balance_loss_mlp": 1.02580047, "epoch": 0.20165338944836916, "flos": 15632220545280.0, "grad_norm": 1.8571748381955588, "language_loss": 0.89597869, "learning_rate": 3.6122404565168533e-06, "loss": 0.91721344, "num_input_tokens_seen": 72432210, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6328125, "step": 3354, "time_per_iteration": 2.3681228160858154 }, { "auxiliary_loss_clip": 0.01022829, "auxiliary_loss_mlp": 0.01010027, "balance_loss_clip": 1.00754726, "balance_loss_mlp": 1.00564718, "epoch": 0.20171351270103713, "flos": 57909629727360.0, "grad_norm": 0.832773605290989, "language_loss": 0.55907285, "learning_rate": 3.612016850712273e-06, "loss": 0.57940137, "num_input_tokens_seen": 72489225, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.171875, "step": 3355, "time_per_iteration": 2.9575581550598145 }, { "auxiliary_loss_clip": 0.01090334, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.02155828, "balance_loss_mlp": 1.02739358, "epoch": 0.2017736359537051, "flos": 20813296464000.0, "grad_norm": 3.6784021252170906, "language_loss": 0.84033597, "learning_rate": 3.611793187378958e-06, "loss": 0.8616457, "num_input_tokens_seen": 72508715, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.62890625, "step": 3356, "time_per_iteration": 2.3754444122314453 }, { "auxiliary_loss_clip": 0.01099126, "auxiliary_loss_mlp": 0.01044522, "balance_loss_clip": 1.02122903, "balance_loss_mlp": 1.02733374, "epoch": 0.20183375920637306, "flos": 17091101283840.0, "grad_norm": 3.0902134296115134, "language_loss": 0.69024551, "learning_rate": 3.61156946652489e-06, "loss": 0.7116819, "num_input_tokens_seen": 72525135, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 0.71875, "step": 3357, "time_per_iteration": 2.348914861679077 }, { "auxiliary_loss_clip": 0.01094203, "auxiliary_loss_mlp": 0.01039822, "balance_loss_clip": 1.01940107, "balance_loss_mlp": 1.02652955, "epoch": 0.20189388245904102, "flos": 18660342430080.0, "grad_norm": 2.08794261793352, "language_loss": 0.71407759, "learning_rate": 3.611345688158053e-06, "loss": 0.73541784, "num_input_tokens_seen": 72543690, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.6796875, "step": 3358, "time_per_iteration": 2.3747360706329346 }, { "auxiliary_loss_clip": 0.01089285, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.01908135, "balance_loss_mlp": 1.02508187, "epoch": 0.20195400571170902, "flos": 16796678855040.0, "grad_norm": 1.6969373474453948, "language_loss": 0.82704282, "learning_rate": 3.6111218522864336e-06, "loss": 0.84830093, "num_input_tokens_seen": 72560725, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.640625, "step": 3359, "time_per_iteration": 2.3471667766571045 }, { "auxiliary_loss_clip": 0.01022401, "auxiliary_loss_mlp": 0.010032, "balance_loss_clip": 1.00067246, "balance_loss_mlp": 1.00500762, "epoch": 0.20201412896437698, "flos": 67171703535360.0, "grad_norm": 0.797525516278317, "language_loss": 0.58966005, "learning_rate": 3.610897958918019e-06, "loss": 0.60991603, "num_input_tokens_seen": 72621940, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.17382812, "step": 3360, "time_per_iteration": 2.9902760982513428 }, { "auxiliary_loss_clip": 0.01094829, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.01514673, "balance_loss_mlp": 1.02801824, "epoch": 0.20207425221704495, "flos": 21323936142720.0, "grad_norm": 2.754692304580016, "language_loss": 0.62511683, "learning_rate": 3.6106740080608e-06, "loss": 0.64641917, "num_input_tokens_seen": 72639135, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.66796875, "step": 3361, "time_per_iteration": 2.374682903289795 }, { "auxiliary_loss_clip": 0.01092233, "auxiliary_loss_mlp": 0.01042311, "balance_loss_clip": 1.02411985, "balance_loss_mlp": 1.02760386, "epoch": 0.2021343754697129, "flos": 22526100587520.0, "grad_norm": 2.081613946848037, "language_loss": 0.75787866, "learning_rate": 3.61044999972277e-06, "loss": 0.77922404, "num_input_tokens_seen": 72658525, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.64453125, "step": 3362, "time_per_iteration": 2.442420482635498 }, { "auxiliary_loss_clip": 0.01092394, "auxiliary_loss_mlp": 0.01038864, "balance_loss_clip": 1.02045846, "balance_loss_mlp": 1.02810884, "epoch": 0.20219449872238088, "flos": 19061773776000.0, "grad_norm": 2.1816263421883795, "language_loss": 0.7692908, "learning_rate": 3.610225933911921e-06, "loss": 0.79060346, "num_input_tokens_seen": 72678085, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.640625, "step": 3363, "time_per_iteration": 2.364548921585083 }, { "auxiliary_loss_clip": 0.01093417, "auxiliary_loss_mlp": 0.01036413, "balance_loss_clip": 1.01863885, "balance_loss_mlp": 1.02827358, "epoch": 0.20225462197504884, "flos": 24715887972480.0, "grad_norm": 1.7871976773933484, "language_loss": 0.74927402, "learning_rate": 3.6100018106362507e-06, "loss": 0.7705723, "num_input_tokens_seen": 72698695, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6484375, "step": 3364, "time_per_iteration": 2.4184486865997314 }, { "auxiliary_loss_clip": 0.01094641, "auxiliary_loss_mlp": 0.01043725, "balance_loss_clip": 1.0238291, "balance_loss_mlp": 1.02821565, "epoch": 0.2023147452277168, "flos": 22017206476800.0, "grad_norm": 3.0410914034095593, "language_loss": 0.71063465, "learning_rate": 3.6097776299037573e-06, "loss": 0.73201829, "num_input_tokens_seen": 72717880, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6640625, "step": 3365, "time_per_iteration": 2.387681007385254 }, { "auxiliary_loss_clip": 0.01093729, "auxiliary_loss_mlp": 0.01041783, "balance_loss_clip": 1.02231598, "balance_loss_mlp": 1.02873588, "epoch": 0.2023748684803848, "flos": 17744500978560.0, "grad_norm": 1.986425390483882, "language_loss": 0.8576386, "learning_rate": 3.609553391722441e-06, "loss": 0.87899375, "num_input_tokens_seen": 72736410, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.65234375, "step": 3366, "time_per_iteration": 2.3731088638305664 }, { "auxiliary_loss_clip": 0.01091239, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.01612079, "balance_loss_mlp": 1.02765083, "epoch": 0.20243499173305277, "flos": 31137602520960.0, "grad_norm": 1.6715117426982948, "language_loss": 0.69499671, "learning_rate": 3.6093290961003044e-06, "loss": 0.71624815, "num_input_tokens_seen": 72758295, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6328125, "step": 3367, "time_per_iteration": 2.4572179317474365 }, { "auxiliary_loss_clip": 0.01096051, "auxiliary_loss_mlp": 0.01039835, "balance_loss_clip": 1.01739931, "balance_loss_mlp": 1.02729523, "epoch": 0.20249511498572073, "flos": 33837820116480.0, "grad_norm": 1.7574362353739208, "language_loss": 0.68120944, "learning_rate": 3.6091047430453517e-06, "loss": 0.70256829, "num_input_tokens_seen": 72782495, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 0.6875, "step": 3368, "time_per_iteration": 2.5089542865753174 }, { "auxiliary_loss_clip": 0.01093486, "auxiliary_loss_mlp": 0.01039775, "balance_loss_clip": 1.0203675, "balance_loss_mlp": 1.02808619, "epoch": 0.2025552382383887, "flos": 21214553253120.0, "grad_norm": 1.6910014215550044, "language_loss": 0.771227, "learning_rate": 3.6088803325655907e-06, "loss": 0.79255962, "num_input_tokens_seen": 72801885, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.65625, "step": 3369, "time_per_iteration": 2.3900160789489746 }, { "auxiliary_loss_clip": 0.01094711, "auxiliary_loss_mlp": 0.01046861, "balance_loss_clip": 1.02626193, "balance_loss_mlp": 1.02703071, "epoch": 0.20261536149105666, "flos": 14646517729920.0, "grad_norm": 3.3090264824915923, "language_loss": 0.65069675, "learning_rate": 3.6086558646690284e-06, "loss": 0.67211252, "num_input_tokens_seen": 72816990, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.6796875, "step": 3370, "time_per_iteration": 2.368041515350342 }, { "auxiliary_loss_clip": 0.01019647, "auxiliary_loss_mlp": 0.01002803, "balance_loss_clip": 1.00054967, "balance_loss_mlp": 1.00291896, "epoch": 0.20267548474372463, "flos": 66780361572480.0, "grad_norm": 0.6786419455732046, "language_loss": 0.58097756, "learning_rate": 3.608431339363677e-06, "loss": 0.60120201, "num_input_tokens_seen": 72879240, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.16796875, "step": 3371, "time_per_iteration": 3.1006276607513428 }, { "auxiliary_loss_clip": 0.0109262, "auxiliary_loss_mlp": 0.01036929, "balance_loss_clip": 1.01752126, "balance_loss_mlp": 1.0270896, "epoch": 0.2027356079963926, "flos": 24679648114560.0, "grad_norm": 2.482154933139551, "language_loss": 0.91939795, "learning_rate": 3.6082067566575474e-06, "loss": 0.94069338, "num_input_tokens_seen": 72899030, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.65625, "step": 3372, "time_per_iteration": 3.8045270442962646 }, { "auxiliary_loss_clip": 0.01094782, "auxiliary_loss_mlp": 0.01046025, "balance_loss_clip": 1.02521157, "balance_loss_mlp": 1.02801609, "epoch": 0.20279573124906058, "flos": 26391649276800.0, "grad_norm": 1.5398457785339825, "language_loss": 0.78556454, "learning_rate": 3.6079821165586563e-06, "loss": 0.80697268, "num_input_tokens_seen": 72919190, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.66796875, "step": 3373, "time_per_iteration": 2.4265899658203125 }, { "auxiliary_loss_clip": 0.01090916, "auxiliary_loss_mlp": 0.01037119, "balance_loss_clip": 1.01793838, "balance_loss_mlp": 1.02722061, "epoch": 0.20285585450172855, "flos": 33798647704320.0, "grad_norm": 1.9416745036578418, "language_loss": 0.71174419, "learning_rate": 3.6077574190750194e-06, "loss": 0.73302448, "num_input_tokens_seen": 72939720, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.63671875, "step": 3374, "time_per_iteration": 3.846038579940796 }, { "auxiliary_loss_clip": 0.010204, "auxiliary_loss_mlp": 0.01003981, "balance_loss_clip": 1.00163269, "balance_loss_mlp": 1.00372291, "epoch": 0.20291597775439651, "flos": 71161332796800.0, "grad_norm": 0.9958992296052362, "language_loss": 0.62452167, "learning_rate": 3.607532664214656e-06, "loss": 0.6447655, "num_input_tokens_seen": 73000015, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.16699219, "step": 3375, "time_per_iteration": 4.376126289367676 }, { "auxiliary_loss_clip": 0.01090797, "auxiliary_loss_mlp": 0.0104347, "balance_loss_clip": 1.02424145, "balance_loss_mlp": 1.02583313, "epoch": 0.20297610100706448, "flos": 19493440225920.0, "grad_norm": 1.558875313038995, "language_loss": 0.82283205, "learning_rate": 3.6073078519855863e-06, "loss": 0.84417474, "num_input_tokens_seen": 73017675, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6484375, "step": 3376, "time_per_iteration": 2.366511583328247 }, { "auxiliary_loss_clip": 0.01093643, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.01220286, "balance_loss_mlp": 1.02646852, "epoch": 0.20303622425973245, "flos": 25043128945920.0, "grad_norm": 2.0001322716041026, "language_loss": 0.81369841, "learning_rate": 3.607082982395835e-06, "loss": 0.83495879, "num_input_tokens_seen": 73036135, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.671875, "step": 3377, "time_per_iteration": 2.394261121749878 }, { "auxiliary_loss_clip": 0.01090901, "auxiliary_loss_mlp": 0.01045261, "balance_loss_clip": 1.02623534, "balance_loss_mlp": 1.02719855, "epoch": 0.2030963475124004, "flos": 21978941760000.0, "grad_norm": 2.1762421344037968, "language_loss": 0.7660687, "learning_rate": 3.6068580554534245e-06, "loss": 0.78743029, "num_input_tokens_seen": 73054075, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.63671875, "step": 3378, "time_per_iteration": 3.735699415206909 }, { "auxiliary_loss_clip": 0.01092641, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.01862407, "balance_loss_mlp": 1.02671015, "epoch": 0.2031564707650684, "flos": 19499375157120.0, "grad_norm": 1.7978053934395233, "language_loss": 0.79383403, "learning_rate": 3.6066330711663845e-06, "loss": 0.81515092, "num_input_tokens_seen": 73073530, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.66015625, "step": 3379, "time_per_iteration": 2.3794503211975098 }, { "auxiliary_loss_clip": 0.01088602, "auxiliary_loss_mlp": 0.01035663, "balance_loss_clip": 1.01787734, "balance_loss_mlp": 1.02662444, "epoch": 0.20321659401773637, "flos": 22745983530240.0, "grad_norm": 1.6450798738094692, "language_loss": 0.86684787, "learning_rate": 3.606408029542743e-06, "loss": 0.88809049, "num_input_tokens_seen": 73092820, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.62109375, "step": 3380, "time_per_iteration": 2.4105381965637207 }, { "auxiliary_loss_clip": 0.01094082, "auxiliary_loss_mlp": 0.01042201, "balance_loss_clip": 1.02209008, "balance_loss_mlp": 1.02926743, "epoch": 0.20327671727040433, "flos": 22454738035200.0, "grad_norm": 1.8939562049784857, "language_loss": 0.74317086, "learning_rate": 3.60618293059053e-06, "loss": 0.76453364, "num_input_tokens_seen": 73113385, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.6484375, "step": 3381, "time_per_iteration": 2.4410035610198975 }, { "auxiliary_loss_clip": 0.01094867, "auxiliary_loss_mlp": 0.01047308, "balance_loss_clip": 1.02769852, "balance_loss_mlp": 1.02803874, "epoch": 0.2033368405230723, "flos": 19534044003840.0, "grad_norm": 2.1740730139041373, "language_loss": 0.79045564, "learning_rate": 3.6059577743177803e-06, "loss": 0.81187737, "num_input_tokens_seen": 73131195, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.66796875, "step": 3382, "time_per_iteration": 2.3881824016571045 }, { "auxiliary_loss_clip": 0.01095466, "auxiliary_loss_mlp": 0.01035108, "balance_loss_clip": 1.01440144, "balance_loss_mlp": 1.02753496, "epoch": 0.20339696377574026, "flos": 13808357786880.0, "grad_norm": 2.669638909555472, "language_loss": 0.79990542, "learning_rate": 3.6057325607325293e-06, "loss": 0.82121116, "num_input_tokens_seen": 73148850, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.6796875, "step": 3383, "time_per_iteration": 2.354276418685913 }, { "auxiliary_loss_clip": 0.01093858, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.01661801, "balance_loss_mlp": 1.02673268, "epoch": 0.20345708702840823, "flos": 20338372972800.0, "grad_norm": 1.8328700479000588, "language_loss": 0.74354744, "learning_rate": 3.605507289842813e-06, "loss": 0.76485312, "num_input_tokens_seen": 73166775, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.671875, "step": 3384, "time_per_iteration": 2.384086847305298 }, { "auxiliary_loss_clip": 0.01097517, "auxiliary_loss_mlp": 0.01039666, "balance_loss_clip": 1.01760077, "balance_loss_mlp": 1.02787459, "epoch": 0.2035172102810762, "flos": 20333066446080.0, "grad_norm": 2.6285400968744397, "language_loss": 0.76375276, "learning_rate": 3.6052819616566717e-06, "loss": 0.78512466, "num_input_tokens_seen": 73183215, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.6953125, "step": 3385, "time_per_iteration": 2.371457815170288 }, { "auxiliary_loss_clip": 0.01092619, "auxiliary_loss_mlp": 0.01052824, "balance_loss_clip": 1.03224897, "balance_loss_mlp": 1.0266149, "epoch": 0.2035773335337442, "flos": 23329870974720.0, "grad_norm": 1.5632193165782053, "language_loss": 0.68682873, "learning_rate": 3.6050565761821464e-06, "loss": 0.70828313, "num_input_tokens_seen": 73203290, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.66015625, "step": 3386, "time_per_iteration": 2.4078545570373535 }, { "auxiliary_loss_clip": 0.01092784, "auxiliary_loss_mlp": 0.01054127, "balance_loss_clip": 1.03370607, "balance_loss_mlp": 1.02673006, "epoch": 0.20363745678641215, "flos": 28329014465280.0, "grad_norm": 1.3779751012375796, "language_loss": 0.81030715, "learning_rate": 3.6048311334272806e-06, "loss": 0.83177626, "num_input_tokens_seen": 73226185, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.66015625, "step": 3387, "time_per_iteration": 2.436722993850708 }, { "auxiliary_loss_clip": 0.01088018, "auxiliary_loss_mlp": 0.0103672, "balance_loss_clip": 1.01711559, "balance_loss_mlp": 1.02645564, "epoch": 0.20369758003908012, "flos": 18914684751360.0, "grad_norm": 2.268566145035477, "language_loss": 0.79522198, "learning_rate": 3.6046056334001195e-06, "loss": 0.81646937, "num_input_tokens_seen": 73243300, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6171875, "step": 3388, "time_per_iteration": 2.3675179481506348 }, { "auxiliary_loss_clip": 0.01092747, "auxiliary_loss_mlp": 0.01042757, "balance_loss_clip": 1.02213335, "balance_loss_mlp": 1.02644062, "epoch": 0.20375770329174808, "flos": 19205546221440.0, "grad_norm": 2.317221942652294, "language_loss": 0.71999937, "learning_rate": 3.604380076108711e-06, "loss": 0.74135441, "num_input_tokens_seen": 73261490, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.6640625, "step": 3389, "time_per_iteration": 2.349846363067627 }, { "auxiliary_loss_clip": 0.01090671, "auxiliary_loss_mlp": 0.01039414, "balance_loss_clip": 1.01960146, "balance_loss_mlp": 1.02674413, "epoch": 0.20381782654441605, "flos": 19389992267520.0, "grad_norm": 1.988578197047473, "language_loss": 0.87115598, "learning_rate": 3.6041544615611047e-06, "loss": 0.89245689, "num_input_tokens_seen": 73280180, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.640625, "step": 3390, "time_per_iteration": 2.3579490184783936 }, { "auxiliary_loss_clip": 0.01092361, "auxiliary_loss_mlp": 0.01036621, "balance_loss_clip": 1.01826298, "balance_loss_mlp": 1.02729249, "epoch": 0.203877949797084, "flos": 24826527671040.0, "grad_norm": 1.7992059945882672, "language_loss": 0.7063992, "learning_rate": 3.6039287897653523e-06, "loss": 0.72768903, "num_input_tokens_seen": 73300680, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6484375, "step": 3391, "time_per_iteration": 2.4349241256713867 }, { "auxiliary_loss_clip": 0.01091526, "auxiliary_loss_mlp": 0.01039392, "balance_loss_clip": 1.02031267, "balance_loss_mlp": 1.02667701, "epoch": 0.20393807304975198, "flos": 18002753372160.0, "grad_norm": 2.4840311671601554, "language_loss": 0.86306632, "learning_rate": 3.6037030607295063e-06, "loss": 0.88437545, "num_input_tokens_seen": 73316760, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6484375, "step": 3392, "time_per_iteration": 2.3522307872772217 }, { "auxiliary_loss_clip": 0.01094558, "auxiliary_loss_mlp": 0.01040172, "balance_loss_clip": 1.02049112, "balance_loss_mlp": 1.0278697, "epoch": 0.20399819630241997, "flos": 24205841786880.0, "grad_norm": 1.6081761552515894, "language_loss": 0.8031919, "learning_rate": 3.603477274461624e-06, "loss": 0.82453918, "num_input_tokens_seen": 73339385, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.66796875, "step": 3393, "time_per_iteration": 2.431004285812378 }, { "auxiliary_loss_clip": 0.01089731, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.01573205, "balance_loss_mlp": 1.02630556, "epoch": 0.20405831955508794, "flos": 20776079088000.0, "grad_norm": 1.8735140560235117, "language_loss": 0.85764956, "learning_rate": 3.603251430969762e-06, "loss": 0.87888443, "num_input_tokens_seen": 73357235, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6328125, "step": 3394, "time_per_iteration": 2.3881540298461914 }, { "auxiliary_loss_clip": 0.01090659, "auxiliary_loss_mlp": 0.01033694, "balance_loss_clip": 1.01533568, "balance_loss_mlp": 1.02663696, "epoch": 0.2041184428077559, "flos": 15486004304640.0, "grad_norm": 2.743829622847949, "language_loss": 0.84085333, "learning_rate": 3.603025530261981e-06, "loss": 0.86209691, "num_input_tokens_seen": 73374435, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.640625, "step": 3395, "time_per_iteration": 2.3404524326324463 }, { "auxiliary_loss_clip": 0.01092742, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.01531529, "balance_loss_mlp": 1.0249157, "epoch": 0.20417856606042387, "flos": 15587776517760.0, "grad_norm": 2.31580434959557, "language_loss": 0.83367699, "learning_rate": 3.602799572346342e-06, "loss": 0.85495389, "num_input_tokens_seen": 73391025, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6796875, "step": 3396, "time_per_iteration": 2.343005895614624 }, { "auxiliary_loss_clip": 0.01091129, "auxiliary_loss_mlp": 0.01034107, "balance_loss_clip": 1.01504493, "balance_loss_mlp": 1.02700043, "epoch": 0.20423868931309183, "flos": 24278216768640.0, "grad_norm": 2.5392266252188356, "language_loss": 0.770015, "learning_rate": 3.602573557230909e-06, "loss": 0.7912674, "num_input_tokens_seen": 73409270, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.640625, "step": 3397, "time_per_iteration": 2.4021718502044678 }, { "auxiliary_loss_clip": 0.01091382, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.01880515, "balance_loss_mlp": 1.02668989, "epoch": 0.2042988125657598, "flos": 18614152834560.0, "grad_norm": 2.4693531404220836, "language_loss": 0.87320244, "learning_rate": 3.602347484923748e-06, "loss": 0.8944878, "num_input_tokens_seen": 73425225, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6484375, "step": 3398, "time_per_iteration": 2.338831901550293 }, { "auxiliary_loss_clip": 0.0109228, "auxiliary_loss_mlp": 0.01039025, "balance_loss_clip": 1.02011812, "balance_loss_mlp": 1.02819514, "epoch": 0.2043589358184278, "flos": 17850462554880.0, "grad_norm": 2.014866073013619, "language_loss": 0.7797541, "learning_rate": 3.6021213554329277e-06, "loss": 0.80106717, "num_input_tokens_seen": 73440940, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.640625, "step": 3399, "time_per_iteration": 2.3305556774139404 }, { "auxiliary_loss_clip": 0.01090105, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.0142591, "balance_loss_mlp": 1.02567756, "epoch": 0.20441905907109575, "flos": 21434121993600.0, "grad_norm": 1.9548736773764572, "language_loss": 0.76317549, "learning_rate": 3.601895168766517e-06, "loss": 0.78441119, "num_input_tokens_seen": 73458805, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.64453125, "step": 3400, "time_per_iteration": 2.3623459339141846 }, { "auxiliary_loss_clip": 0.01091122, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 1.01837873, "balance_loss_mlp": 1.02779388, "epoch": 0.20447918232376372, "flos": 27706513190400.0, "grad_norm": 1.695005337189241, "language_loss": 0.79227334, "learning_rate": 3.601668924932588e-06, "loss": 0.81353664, "num_input_tokens_seen": 73479380, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.6328125, "step": 3401, "time_per_iteration": 2.439919948577881 }, { "auxiliary_loss_clip": 0.01091997, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.01454926, "balance_loss_mlp": 1.02668262, "epoch": 0.20453930557643168, "flos": 30522746833920.0, "grad_norm": 2.1437863458260242, "language_loss": 0.6956296, "learning_rate": 3.601442623939215e-06, "loss": 0.71689069, "num_input_tokens_seen": 73505105, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.65234375, "step": 3402, "time_per_iteration": 2.5202667713165283 }, { "auxiliary_loss_clip": 0.0109146, "auxiliary_loss_mlp": 0.01035624, "balance_loss_clip": 1.01727784, "balance_loss_mlp": 1.02655721, "epoch": 0.20459942882909965, "flos": 18986815353600.0, "grad_norm": 2.422100239714057, "language_loss": 0.80654657, "learning_rate": 3.6012162657944745e-06, "loss": 0.82781738, "num_input_tokens_seen": 73523700, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6484375, "step": 3403, "time_per_iteration": 2.396238088607788 }, { "auxiliary_loss_clip": 0.01093829, "auxiliary_loss_mlp": 0.0103324, "balance_loss_clip": 1.01473844, "balance_loss_mlp": 1.02883208, "epoch": 0.20465955208176762, "flos": 20338023859200.0, "grad_norm": 1.8919767613263943, "language_loss": 0.82822037, "learning_rate": 3.600989850506444e-06, "loss": 0.84949106, "num_input_tokens_seen": 73542625, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6484375, "step": 3404, "time_per_iteration": 2.3653664588928223 }, { "auxiliary_loss_clip": 0.01090939, "auxiliary_loss_mlp": 0.01041672, "balance_loss_clip": 1.02270603, "balance_loss_mlp": 1.02520812, "epoch": 0.20471967533443558, "flos": 21250234529280.0, "grad_norm": 1.863448038932841, "language_loss": 0.85795009, "learning_rate": 3.6007633780832043e-06, "loss": 0.87927622, "num_input_tokens_seen": 73561450, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.65625, "step": 3405, "time_per_iteration": 2.3978824615478516 }, { "auxiliary_loss_clip": 0.01089667, "auxiliary_loss_mlp": 0.01037807, "balance_loss_clip": 1.01919866, "balance_loss_mlp": 1.02511287, "epoch": 0.20477979858710357, "flos": 14500685514240.0, "grad_norm": 2.7106561337924093, "language_loss": 0.84639657, "learning_rate": 3.600536848532837e-06, "loss": 0.86767137, "num_input_tokens_seen": 73577155, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6484375, "step": 3406, "time_per_iteration": 2.3337669372558594 }, { "auxiliary_loss_clip": 0.01089747, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.0159061, "balance_loss_mlp": 1.02772033, "epoch": 0.20483992183977154, "flos": 11399525331840.0, "grad_norm": 2.074774039957221, "language_loss": 0.67743528, "learning_rate": 3.600310261863427e-06, "loss": 0.69866359, "num_input_tokens_seen": 73594900, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.62109375, "step": 3407, "time_per_iteration": 2.3922691345214844 }, { "auxiliary_loss_clip": 0.0108831, "auxiliary_loss_mlp": 0.01034514, "balance_loss_clip": 1.01623893, "balance_loss_mlp": 1.02531064, "epoch": 0.2049000450924395, "flos": 19059329980800.0, "grad_norm": 2.0119760705765226, "language_loss": 0.84033918, "learning_rate": 3.6000836180830598e-06, "loss": 0.86156738, "num_input_tokens_seen": 73613810, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.6328125, "step": 3408, "time_per_iteration": 2.3717234134674072 }, { "auxiliary_loss_clip": 0.01091751, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.01751184, "balance_loss_mlp": 1.02712226, "epoch": 0.20496016834510747, "flos": 14573688900480.0, "grad_norm": 2.145279569849743, "language_loss": 0.63664538, "learning_rate": 3.5998569171998247e-06, "loss": 0.65792221, "num_input_tokens_seen": 73631495, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6484375, "step": 3409, "time_per_iteration": 2.3818070888519287 }, { "auxiliary_loss_clip": 0.0108886, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.01383233, "balance_loss_mlp": 1.02462614, "epoch": 0.20502029159777543, "flos": 22125576936960.0, "grad_norm": 1.3643240352330017, "language_loss": 0.80532646, "learning_rate": 3.599630159221811e-06, "loss": 0.82653105, "num_input_tokens_seen": 73652840, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.640625, "step": 3410, "time_per_iteration": 2.3842222690582275 }, { "auxiliary_loss_clip": 0.01089651, "auxiliary_loss_mlp": 0.01036217, "balance_loss_clip": 1.01829982, "balance_loss_mlp": 1.02750182, "epoch": 0.2050804148504434, "flos": 25366913694720.0, "grad_norm": 3.1972839530835313, "language_loss": 0.75666493, "learning_rate": 3.599403344157112e-06, "loss": 0.77792358, "num_input_tokens_seen": 73672150, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.62109375, "step": 3411, "time_per_iteration": 3.8647632598876953 }, { "auxiliary_loss_clip": 0.0109096, "auxiliary_loss_mlp": 0.01042045, "balance_loss_clip": 1.0248549, "balance_loss_mlp": 1.0274868, "epoch": 0.2051405381031114, "flos": 23619126522240.0, "grad_norm": 1.7979701633073433, "language_loss": 0.73625255, "learning_rate": 3.5991764720138214e-06, "loss": 0.7575826, "num_input_tokens_seen": 73691940, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6328125, "step": 3412, "time_per_iteration": 2.38908314704895 }, { "auxiliary_loss_clip": 0.01093034, "auxiliary_loss_mlp": 0.01045989, "balance_loss_clip": 1.02583075, "balance_loss_mlp": 1.02652621, "epoch": 0.20520066135577936, "flos": 19564732955520.0, "grad_norm": 2.359456746502695, "language_loss": 0.77695239, "learning_rate": 3.598949542800037e-06, "loss": 0.79834253, "num_input_tokens_seen": 73709080, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.6640625, "step": 3413, "time_per_iteration": 2.4109857082366943 }, { "auxiliary_loss_clip": 0.01088034, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.01910043, "balance_loss_mlp": 1.02752459, "epoch": 0.20526078460844732, "flos": 17675372753280.0, "grad_norm": 1.9421409032084236, "language_loss": 0.85001194, "learning_rate": 3.5987225565238556e-06, "loss": 0.87126493, "num_input_tokens_seen": 73727670, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.60546875, "step": 3414, "time_per_iteration": 3.822932481765747 }, { "auxiliary_loss_clip": 0.01089411, "auxiliary_loss_mlp": 0.0103071, "balance_loss_clip": 1.01269698, "balance_loss_mlp": 1.02587461, "epoch": 0.2053209078611153, "flos": 21499444880640.0, "grad_norm": 2.0015277424090763, "language_loss": 0.80799913, "learning_rate": 3.598495513193379e-06, "loss": 0.82920033, "num_input_tokens_seen": 73747170, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6328125, "step": 3415, "time_per_iteration": 3.8386893272399902 }, { "auxiliary_loss_clip": 0.01088169, "auxiliary_loss_mlp": 0.01028838, "balance_loss_clip": 1.01270878, "balance_loss_mlp": 1.02676189, "epoch": 0.20538103111378325, "flos": 25662418375680.0, "grad_norm": 1.8200864009169986, "language_loss": 0.72592711, "learning_rate": 3.5982684128167093e-06, "loss": 0.74709719, "num_input_tokens_seen": 73767690, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.61328125, "step": 3416, "time_per_iteration": 2.4197278022766113 }, { "auxiliary_loss_clip": 0.0109024, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.01505804, "balance_loss_mlp": 1.02519274, "epoch": 0.20544115436645122, "flos": 23147833812480.0, "grad_norm": 1.9038989953330967, "language_loss": 0.78435564, "learning_rate": 3.598041255401951e-06, "loss": 0.80558705, "num_input_tokens_seen": 73786900, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.65234375, "step": 3417, "time_per_iteration": 3.752944231033325 }, { "auxiliary_loss_clip": 0.01091466, "auxiliary_loss_mlp": 0.01035336, "balance_loss_clip": 1.0164535, "balance_loss_mlp": 1.02768421, "epoch": 0.20550127761911918, "flos": 19389433685760.0, "grad_norm": 2.9929824267351792, "language_loss": 0.87379462, "learning_rate": 3.5978140409572105e-06, "loss": 0.89506263, "num_input_tokens_seen": 73804515, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.63671875, "step": 3418, "time_per_iteration": 2.3542520999908447 }, { "auxiliary_loss_clip": 0.01089245, "auxiliary_loss_mlp": 0.0103486, "balance_loss_clip": 1.01657367, "balance_loss_mlp": 1.02650845, "epoch": 0.20556140087178718, "flos": 22892025214080.0, "grad_norm": 2.0387380567678037, "language_loss": 0.62095773, "learning_rate": 3.597586769490598e-06, "loss": 0.6421988, "num_input_tokens_seen": 73822910, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.62890625, "step": 3419, "time_per_iteration": 2.4001595973968506 }, { "auxiliary_loss_clip": 0.01095027, "auxiliary_loss_mlp": 0.01039858, "balance_loss_clip": 1.01910341, "balance_loss_mlp": 1.0285635, "epoch": 0.20562152412445514, "flos": 19788700527360.0, "grad_norm": 1.7780668301721039, "language_loss": 0.86269796, "learning_rate": 3.5973594410102218e-06, "loss": 0.88404679, "num_input_tokens_seen": 73841160, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.6640625, "step": 3420, "time_per_iteration": 2.371304750442505 }, { "auxiliary_loss_clip": 0.01088989, "auxiliary_loss_mlp": 0.01032728, "balance_loss_clip": 1.01493001, "balance_loss_mlp": 1.0254879, "epoch": 0.2056816473771231, "flos": 31500699327360.0, "grad_norm": 3.131106011252814, "language_loss": 0.71428061, "learning_rate": 3.5971320555241967e-06, "loss": 0.73549777, "num_input_tokens_seen": 73862795, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.63671875, "step": 3421, "time_per_iteration": 2.532630681991577 }, { "auxiliary_loss_clip": 0.01088822, "auxiliary_loss_mlp": 0.01034526, "balance_loss_clip": 1.01706159, "balance_loss_mlp": 1.02615345, "epoch": 0.20574177062979107, "flos": 23257251613440.0, "grad_norm": 2.2017221536752407, "language_loss": 0.70898926, "learning_rate": 3.5969046130406376e-06, "loss": 0.7302227, "num_input_tokens_seen": 73881525, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.625, "step": 3422, "time_per_iteration": 2.376978874206543 }, { "auxiliary_loss_clip": 0.01023299, "auxiliary_loss_mlp": 0.01000564, "balance_loss_clip": 0.9985975, "balance_loss_mlp": 1.00740194, "epoch": 0.20580189388245904, "flos": 70309417777920.0, "grad_norm": 0.745883669525793, "language_loss": 0.55542767, "learning_rate": 3.5966771135676596e-06, "loss": 0.57566631, "num_input_tokens_seen": 73937775, "router_z_loss_clip": 0.01965332, "router_z_loss_mlp": 0.15820312, "step": 3423, "time_per_iteration": 3.106325387954712 }, { "auxiliary_loss_clip": 0.0109086, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.01633012, "balance_loss_mlp": 1.02702034, "epoch": 0.205862017135127, "flos": 30736520288640.0, "grad_norm": 1.8105349906243213, "language_loss": 0.71785295, "learning_rate": 3.5964495571133835e-06, "loss": 0.73911256, "num_input_tokens_seen": 73958250, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.640625, "step": 3424, "time_per_iteration": 2.444450855255127 }, { "auxiliary_loss_clip": 0.01089295, "auxiliary_loss_mlp": 0.01038805, "balance_loss_clip": 1.02225924, "balance_loss_mlp": 1.02796054, "epoch": 0.20592214038779497, "flos": 21323482295040.0, "grad_norm": 1.562763777237167, "language_loss": 0.75264859, "learning_rate": 3.596221943685928e-06, "loss": 0.7739296, "num_input_tokens_seen": 73977775, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.61328125, "step": 3425, "time_per_iteration": 2.4433488845825195 }, { "auxiliary_loss_clip": 0.01093009, "auxiliary_loss_mlp": 0.010466, "balance_loss_clip": 1.02914762, "balance_loss_mlp": 1.02995443, "epoch": 0.20598226364046296, "flos": 22890593848320.0, "grad_norm": 1.7784879169232835, "language_loss": 0.87846279, "learning_rate": 3.5959942732934184e-06, "loss": 0.89985889, "num_input_tokens_seen": 73996590, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.62890625, "step": 3426, "time_per_iteration": 2.3792903423309326 }, { "auxiliary_loss_clip": 0.01090758, "auxiliary_loss_mlp": 0.01034388, "balance_loss_clip": 1.01623249, "balance_loss_mlp": 1.02898359, "epoch": 0.20604238689313092, "flos": 23877413827200.0, "grad_norm": 1.4953182586608793, "language_loss": 0.76311988, "learning_rate": 3.595766545943978e-06, "loss": 0.78437138, "num_input_tokens_seen": 74015935, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6171875, "step": 3427, "time_per_iteration": 2.434981346130371 }, { "auxiliary_loss_clip": 0.01090663, "auxiliary_loss_mlp": 0.01040024, "balance_loss_clip": 1.02062881, "balance_loss_mlp": 1.02673054, "epoch": 0.2061025101457989, "flos": 22490454222720.0, "grad_norm": 1.7734894470627613, "language_loss": 0.73887622, "learning_rate": 3.5955387616457347e-06, "loss": 0.7601831, "num_input_tokens_seen": 74036575, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.640625, "step": 3428, "time_per_iteration": 2.378920078277588 }, { "auxiliary_loss_clip": 0.01089111, "auxiliary_loss_mlp": 0.01038805, "balance_loss_clip": 1.01951647, "balance_loss_mlp": 1.02474248, "epoch": 0.20616263339846685, "flos": 22777964202240.0, "grad_norm": 1.6950960930414565, "language_loss": 0.73476946, "learning_rate": 3.5953109204068167e-06, "loss": 0.75604856, "num_input_tokens_seen": 74055365, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.64453125, "step": 3429, "time_per_iteration": 2.4017746448516846 }, { "auxiliary_loss_clip": 0.01096581, "auxiliary_loss_mlp": 0.01037211, "balance_loss_clip": 1.01843596, "balance_loss_mlp": 1.03064489, "epoch": 0.20622275665113482, "flos": 20881272614400.0, "grad_norm": 2.21737742663155, "language_loss": 0.84927869, "learning_rate": 3.5950830222353563e-06, "loss": 0.87061667, "num_input_tokens_seen": 74074875, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65625, "step": 3430, "time_per_iteration": 2.3771278858184814 }, { "auxiliary_loss_clip": 0.01092067, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.01673174, "balance_loss_mlp": 1.02689767, "epoch": 0.20628287990380278, "flos": 19353403296000.0, "grad_norm": 4.316906960372921, "language_loss": 0.68970323, "learning_rate": 3.594855067139486e-06, "loss": 0.7109893, "num_input_tokens_seen": 74094505, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.65234375, "step": 3431, "time_per_iteration": 2.4158811569213867 }, { "auxiliary_loss_clip": 0.01021422, "auxiliary_loss_mlp": 0.01005794, "balance_loss_clip": 1.00362396, "balance_loss_mlp": 1.00544739, "epoch": 0.20634300315647078, "flos": 71514759156480.0, "grad_norm": 0.8058109401968928, "language_loss": 0.60227937, "learning_rate": 3.59462705512734e-06, "loss": 0.6225515, "num_input_tokens_seen": 74158500, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.16015625, "step": 3432, "time_per_iteration": 3.1592984199523926 }, { "auxiliary_loss_clip": 0.0109354, "auxiliary_loss_mlp": 0.01042487, "balance_loss_clip": 1.02429545, "balance_loss_mlp": 1.02901411, "epoch": 0.20640312640913874, "flos": 21722923693440.0, "grad_norm": 1.5470741195994464, "language_loss": 0.72117704, "learning_rate": 3.594398986207056e-06, "loss": 0.74253732, "num_input_tokens_seen": 74176685, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.64453125, "step": 3433, "time_per_iteration": 2.407956600189209 }, { "auxiliary_loss_clip": 0.01092175, "auxiliary_loss_mlp": 0.01037297, "balance_loss_clip": 1.01973784, "balance_loss_mlp": 1.02718782, "epoch": 0.2064632496618067, "flos": 20553682527360.0, "grad_norm": 1.724537070049624, "language_loss": 0.86899883, "learning_rate": 3.5941708603867747e-06, "loss": 0.8902936, "num_input_tokens_seen": 74194935, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6484375, "step": 3434, "time_per_iteration": 2.3818016052246094 }, { "auxiliary_loss_clip": 0.0109335, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.02058661, "balance_loss_mlp": 1.02697265, "epoch": 0.20652337291447467, "flos": 29822040380160.0, "grad_norm": 1.6386270787125943, "language_loss": 0.69367266, "learning_rate": 3.5939426776746356e-06, "loss": 0.71500558, "num_input_tokens_seen": 74215400, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6640625, "step": 3435, "time_per_iteration": 2.4560468196868896 }, { "auxiliary_loss_clip": 0.01090102, "auxiliary_loss_mlp": 0.01038034, "balance_loss_clip": 1.01921082, "balance_loss_mlp": 1.02793837, "epoch": 0.20658349616714264, "flos": 26212439934720.0, "grad_norm": 2.479641386098983, "language_loss": 0.89462423, "learning_rate": 3.593714438078782e-06, "loss": 0.91590559, "num_input_tokens_seen": 74234090, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.62109375, "step": 3436, "time_per_iteration": 2.416600227355957 }, { "auxiliary_loss_clip": 0.01092537, "auxiliary_loss_mlp": 0.01032888, "balance_loss_clip": 1.01438689, "balance_loss_mlp": 1.02791631, "epoch": 0.2066436194198106, "flos": 25993185396480.0, "grad_norm": 1.8382826318204257, "language_loss": 0.76266444, "learning_rate": 3.59348614160736e-06, "loss": 0.78391868, "num_input_tokens_seen": 74253345, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6484375, "step": 3437, "time_per_iteration": 2.427842378616333 }, { "auxiliary_loss_clip": 0.01090912, "auxiliary_loss_mlp": 0.0103481, "balance_loss_clip": 1.01736999, "balance_loss_mlp": 1.02654123, "epoch": 0.20670374267247857, "flos": 21360001443840.0, "grad_norm": 2.126482635828829, "language_loss": 0.77968448, "learning_rate": 3.5932577882685164e-06, "loss": 0.80094171, "num_input_tokens_seen": 74271615, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.64453125, "step": 3438, "time_per_iteration": 2.369384765625 }, { "auxiliary_loss_clip": 0.01020046, "auxiliary_loss_mlp": 0.01010954, "balance_loss_clip": 1.00886786, "balance_loss_mlp": 1.00374675, "epoch": 0.20676386592514656, "flos": 66379977567360.0, "grad_norm": 0.8477758193078978, "language_loss": 0.67162991, "learning_rate": 3.593029378070401e-06, "loss": 0.69193995, "num_input_tokens_seen": 74331390, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.16308594, "step": 3439, "time_per_iteration": 2.9917290210723877 }, { "auxiliary_loss_clip": 0.01091483, "auxiliary_loss_mlp": 0.01031657, "balance_loss_clip": 1.01343036, "balance_loss_mlp": 1.02587008, "epoch": 0.20682398917781453, "flos": 17273627205120.0, "grad_norm": 2.236879316241981, "language_loss": 0.84296453, "learning_rate": 3.5928009110211646e-06, "loss": 0.86419594, "num_input_tokens_seen": 74347335, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.65625, "step": 3440, "time_per_iteration": 2.3410487174987793 }, { "auxiliary_loss_clip": 0.01092394, "auxiliary_loss_mlp": 0.01042595, "balance_loss_clip": 1.02478552, "balance_loss_mlp": 1.02785289, "epoch": 0.2068841124304825, "flos": 18076315340160.0, "grad_norm": 2.077953152278859, "language_loss": 0.84483582, "learning_rate": 3.592572387128961e-06, "loss": 0.86618572, "num_input_tokens_seen": 74366310, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6484375, "step": 3441, "time_per_iteration": 2.353750705718994 }, { "auxiliary_loss_clip": 0.01091054, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.01772404, "balance_loss_mlp": 1.02873576, "epoch": 0.20694423568315046, "flos": 27345720533760.0, "grad_norm": 1.7365703515053852, "language_loss": 0.85869229, "learning_rate": 3.5923438064019457e-06, "loss": 0.87995303, "num_input_tokens_seen": 74387100, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.625, "step": 3442, "time_per_iteration": 2.439638614654541 }, { "auxiliary_loss_clip": 0.01096404, "auxiliary_loss_mlp": 0.01042477, "balance_loss_clip": 1.02298689, "balance_loss_mlp": 1.0300343, "epoch": 0.20700435893581842, "flos": 20228815526400.0, "grad_norm": 1.9196280044402672, "language_loss": 0.73328567, "learning_rate": 3.5921151688482754e-06, "loss": 0.75467443, "num_input_tokens_seen": 74404460, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6640625, "step": 3443, "time_per_iteration": 2.372800827026367 }, { "auxiliary_loss_clip": 0.01091153, "auxiliary_loss_mlp": 0.01036531, "balance_loss_clip": 1.01932955, "balance_loss_mlp": 1.02771688, "epoch": 0.2070644821884864, "flos": 20630072315520.0, "grad_norm": 1.8510691069900311, "language_loss": 0.85541952, "learning_rate": 3.5918864744761106e-06, "loss": 0.87669635, "num_input_tokens_seen": 74423790, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.63671875, "step": 3444, "time_per_iteration": 2.381258249282837 }, { "auxiliary_loss_clip": 0.01018973, "auxiliary_loss_mlp": 0.0100074, "balance_loss_clip": 0.99864203, "balance_loss_mlp": 1.00311446, "epoch": 0.20712460544115438, "flos": 65937907532160.0, "grad_norm": 0.6898303371855926, "language_loss": 0.57121408, "learning_rate": 3.5916577232936116e-06, "loss": 0.59141123, "num_input_tokens_seen": 74488130, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.15820312, "step": 3445, "time_per_iteration": 2.9848201274871826 }, { "auxiliary_loss_clip": 0.01090997, "auxiliary_loss_mlp": 0.01038907, "balance_loss_clip": 1.0204649, "balance_loss_mlp": 1.02755046, "epoch": 0.20718472869382235, "flos": 19424765848320.0, "grad_norm": 1.4452724688645875, "language_loss": 0.78202057, "learning_rate": 3.591428915308944e-06, "loss": 0.80331963, "num_input_tokens_seen": 74506720, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6328125, "step": 3446, "time_per_iteration": 2.362121105194092 }, { "auxiliary_loss_clip": 0.01096816, "auxiliary_loss_mlp": 0.01041338, "balance_loss_clip": 1.01996422, "balance_loss_mlp": 1.02874351, "epoch": 0.2072448519464903, "flos": 24497890243200.0, "grad_norm": 2.1383938563221787, "language_loss": 0.62538004, "learning_rate": 3.5912000505302706e-06, "loss": 0.64676166, "num_input_tokens_seen": 74525330, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 0.6796875, "step": 3447, "time_per_iteration": 2.4085800647735596 }, { "auxiliary_loss_clip": 0.01094221, "auxiliary_loss_mlp": 0.010403, "balance_loss_clip": 1.02170324, "balance_loss_mlp": 1.02841926, "epoch": 0.20730497519915828, "flos": 23074586046720.0, "grad_norm": 1.8454981068552492, "language_loss": 0.85958946, "learning_rate": 3.590971128965761e-06, "loss": 0.88093472, "num_input_tokens_seen": 74544535, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.65625, "step": 3448, "time_per_iteration": 2.3759377002716064 }, { "auxiliary_loss_clip": 0.01092135, "auxiliary_loss_mlp": 0.01041375, "balance_loss_clip": 1.02066863, "balance_loss_mlp": 1.02737164, "epoch": 0.20736509845182624, "flos": 21067987898880.0, "grad_norm": 2.0909084611212125, "language_loss": 0.75326729, "learning_rate": 3.5907421506235844e-06, "loss": 0.77460241, "num_input_tokens_seen": 74562300, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.6484375, "step": 3449, "time_per_iteration": 2.3815219402313232 }, { "auxiliary_loss_clip": 0.01094984, "auxiliary_loss_mlp": 0.01040994, "balance_loss_clip": 1.02082419, "balance_loss_mlp": 1.0273211, "epoch": 0.2074252217044942, "flos": 17632499736960.0, "grad_norm": 1.9843231891096198, "language_loss": 0.76733154, "learning_rate": 3.5905131155119124e-06, "loss": 0.78869134, "num_input_tokens_seen": 74580080, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.67578125, "step": 3450, "time_per_iteration": 2.3549654483795166 }, { "auxiliary_loss_clip": 0.01095435, "auxiliary_loss_mlp": 0.01035646, "balance_loss_clip": 1.01689434, "balance_loss_mlp": 1.02833676, "epoch": 0.20748534495716217, "flos": 23545948579200.0, "grad_norm": 1.6702905135861918, "language_loss": 0.82409501, "learning_rate": 3.590284023638918e-06, "loss": 0.84540582, "num_input_tokens_seen": 74598980, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.671875, "step": 3451, "time_per_iteration": 3.779995918273926 }, { "auxiliary_loss_clip": 0.01019154, "auxiliary_loss_mlp": 0.01001525, "balance_loss_clip": 0.99953437, "balance_loss_mlp": 1.0033164, "epoch": 0.20754546820983016, "flos": 52250313738240.0, "grad_norm": 1.2365319613705386, "language_loss": 0.56629205, "learning_rate": 3.5900548750127784e-06, "loss": 0.58649886, "num_input_tokens_seen": 74655275, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.15820312, "step": 3452, "time_per_iteration": 2.9152235984802246 }, { "auxiliary_loss_clip": 0.01096062, "auxiliary_loss_mlp": 0.01043294, "balance_loss_clip": 1.02307582, "balance_loss_mlp": 1.02723455, "epoch": 0.20760559146249813, "flos": 20411341447680.0, "grad_norm": 1.8644726447354187, "language_loss": 0.87885737, "learning_rate": 3.5898256696416704e-06, "loss": 0.90025091, "num_input_tokens_seen": 74674560, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.6875, "step": 3453, "time_per_iteration": 2.374065637588501 }, { "auxiliary_loss_clip": 0.01093974, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.01370907, "balance_loss_mlp": 1.02928376, "epoch": 0.2076657147151661, "flos": 23184876631680.0, "grad_norm": 4.938075755184813, "language_loss": 0.80229759, "learning_rate": 3.589596407533773e-06, "loss": 0.82358384, "num_input_tokens_seen": 74694500, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.6484375, "step": 3454, "time_per_iteration": 5.1431028842926025 }, { "auxiliary_loss_clip": 0.01093993, "auxiliary_loss_mlp": 0.01042325, "balance_loss_clip": 1.0225246, "balance_loss_mlp": 1.02817988, "epoch": 0.20772583796783406, "flos": 18292323121920.0, "grad_norm": 2.81641100334554, "language_loss": 0.76740152, "learning_rate": 3.589367088697269e-06, "loss": 0.78876472, "num_input_tokens_seen": 74710485, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.65625, "step": 3455, "time_per_iteration": 2.3422646522521973 }, { "auxiliary_loss_clip": 0.01091646, "auxiliary_loss_mlp": 0.01035814, "balance_loss_clip": 1.017205, "balance_loss_mlp": 1.02829957, "epoch": 0.20778596122050202, "flos": 17601845696640.0, "grad_norm": 2.3858522831270363, "language_loss": 0.80792063, "learning_rate": 3.5891377131403423e-06, "loss": 0.8291952, "num_input_tokens_seen": 74727450, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6328125, "step": 3456, "time_per_iteration": 2.3777241706848145 }, { "auxiliary_loss_clip": 0.01096006, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.01687002, "balance_loss_mlp": 1.02885818, "epoch": 0.20784608447317, "flos": 23804445352320.0, "grad_norm": 1.4983478884464172, "language_loss": 0.77782631, "learning_rate": 3.5889082808711776e-06, "loss": 0.79914713, "num_input_tokens_seen": 74746725, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.671875, "step": 3457, "time_per_iteration": 3.7937073707580566 }, { "auxiliary_loss_clip": 0.01095138, "auxiliary_loss_mlp": 0.01040004, "balance_loss_clip": 1.0195601, "balance_loss_mlp": 1.02762175, "epoch": 0.20790620772583795, "flos": 17638329934080.0, "grad_norm": 1.7504821464077382, "language_loss": 0.83592123, "learning_rate": 3.5886787918979645e-06, "loss": 0.85727262, "num_input_tokens_seen": 74765255, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.67578125, "step": 3458, "time_per_iteration": 2.3449764251708984 }, { "auxiliary_loss_clip": 0.01090537, "auxiliary_loss_mlp": 0.01033334, "balance_loss_clip": 1.01442719, "balance_loss_mlp": 1.02720952, "epoch": 0.20796633097850595, "flos": 27672228368640.0, "grad_norm": 1.650509758721497, "language_loss": 0.76041085, "learning_rate": 3.588449246228891e-06, "loss": 0.78164959, "num_input_tokens_seen": 74785710, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6328125, "step": 3459, "time_per_iteration": 2.4398462772369385 }, { "auxiliary_loss_clip": 0.01087872, "auxiliary_loss_mlp": 0.0103317, "balance_loss_clip": 1.0155623, "balance_loss_mlp": 1.02618527, "epoch": 0.2080264542311739, "flos": 19244578988160.0, "grad_norm": 2.161015419600236, "language_loss": 0.76956034, "learning_rate": 3.5882196438721504e-06, "loss": 0.79077077, "num_input_tokens_seen": 74804490, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6171875, "step": 3460, "time_per_iteration": 2.3799078464508057 }, { "auxiliary_loss_clip": 0.01095987, "auxiliary_loss_mlp": 0.0104126, "balance_loss_clip": 1.02234113, "balance_loss_mlp": 1.02936792, "epoch": 0.20808657748384188, "flos": 27524720407680.0, "grad_norm": 1.7015066210378609, "language_loss": 0.75361431, "learning_rate": 3.5879899848359367e-06, "loss": 0.7749868, "num_input_tokens_seen": 74826340, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.66796875, "step": 3461, "time_per_iteration": 2.442997694015503 }, { "auxiliary_loss_clip": 0.01093002, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.01685953, "balance_loss_mlp": 1.02591705, "epoch": 0.20814670073650984, "flos": 26905710268800.0, "grad_norm": 3.551495866186673, "language_loss": 0.88157642, "learning_rate": 3.587760269128444e-06, "loss": 0.90287745, "num_input_tokens_seen": 74844960, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.671875, "step": 3462, "time_per_iteration": 2.41481614112854 }, { "auxiliary_loss_clip": 0.01017315, "auxiliary_loss_mlp": 0.01001502, "balance_loss_clip": 0.99936807, "balance_loss_mlp": 1.00202131, "epoch": 0.2082068239891778, "flos": 70172383224960.0, "grad_norm": 0.7527823553943578, "language_loss": 0.58998442, "learning_rate": 3.587530496757872e-06, "loss": 0.61017263, "num_input_tokens_seen": 74909075, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.15234375, "step": 3463, "time_per_iteration": 3.123060941696167 }, { "auxiliary_loss_clip": 0.01092986, "auxiliary_loss_mlp": 0.0104451, "balance_loss_clip": 1.02433991, "balance_loss_mlp": 1.02627039, "epoch": 0.20826694724184577, "flos": 24606924019200.0, "grad_norm": 2.303712602197063, "language_loss": 0.66127217, "learning_rate": 3.5873006677324204e-06, "loss": 0.68264711, "num_input_tokens_seen": 74928125, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.66796875, "step": 3464, "time_per_iteration": 2.4263315200805664 }, { "auxiliary_loss_clip": 0.01096491, "auxiliary_loss_mlp": 0.0103888, "balance_loss_clip": 1.01911533, "balance_loss_mlp": 1.02857804, "epoch": 0.20832707049451377, "flos": 12892097399040.0, "grad_norm": 1.8989058898156979, "language_loss": 0.83964658, "learning_rate": 3.587070782060291e-06, "loss": 0.86100036, "num_input_tokens_seen": 74945090, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6796875, "step": 3465, "time_per_iteration": 2.366279125213623 }, { "auxiliary_loss_clip": 0.01094326, "auxiliary_loss_mlp": 0.01037246, "balance_loss_clip": 1.018327, "balance_loss_mlp": 1.02799881, "epoch": 0.20838719374718173, "flos": 22197777361920.0, "grad_norm": 2.5378212427357774, "language_loss": 0.81736517, "learning_rate": 3.5868408397496874e-06, "loss": 0.83868092, "num_input_tokens_seen": 74963630, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6640625, "step": 3466, "time_per_iteration": 2.37129807472229 }, { "auxiliary_loss_clip": 0.01092957, "auxiliary_loss_mlp": 0.01034349, "balance_loss_clip": 1.01684952, "balance_loss_mlp": 1.02911043, "epoch": 0.2084473169998497, "flos": 15157750901760.0, "grad_norm": 1.750566358385547, "language_loss": 0.81937397, "learning_rate": 3.5866108408088166e-06, "loss": 0.84064704, "num_input_tokens_seen": 74981875, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.640625, "step": 3467, "time_per_iteration": 2.3644158840179443 }, { "auxiliary_loss_clip": 0.01089734, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.0203985, "balance_loss_mlp": 1.02880931, "epoch": 0.20850744025251766, "flos": 17455838924160.0, "grad_norm": 2.1968935159291356, "language_loss": 0.81883782, "learning_rate": 3.5863807852458858e-06, "loss": 0.8401081, "num_input_tokens_seen": 74999155, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.609375, "step": 3468, "time_per_iteration": 2.355436325073242 }, { "auxiliary_loss_clip": 0.01095281, "auxiliary_loss_mlp": 0.01041346, "balance_loss_clip": 1.01943529, "balance_loss_mlp": 1.02690184, "epoch": 0.20856756350518563, "flos": 25697890183680.0, "grad_norm": 1.9899682280664843, "language_loss": 0.89986277, "learning_rate": 3.5861506730691054e-06, "loss": 0.921229, "num_input_tokens_seen": 75017850, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 0.68359375, "step": 3469, "time_per_iteration": 2.412328004837036 }, { "auxiliary_loss_clip": 0.01095278, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.01392412, "balance_loss_mlp": 1.03051162, "epoch": 0.2086276867578536, "flos": 37887535560960.0, "grad_norm": 1.9551924636038538, "language_loss": 0.76771545, "learning_rate": 3.5859205042866877e-06, "loss": 0.78898388, "num_input_tokens_seen": 75039270, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.6484375, "step": 3470, "time_per_iteration": 2.532867193222046 }, { "auxiliary_loss_clip": 0.01091899, "auxiliary_loss_mlp": 0.01033378, "balance_loss_clip": 1.0141381, "balance_loss_mlp": 1.02850223, "epoch": 0.20868781001052156, "flos": 25555863306240.0, "grad_norm": 4.5420928354475, "language_loss": 0.7589848, "learning_rate": 3.5856902789068465e-06, "loss": 0.78023756, "num_input_tokens_seen": 75059350, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.63671875, "step": 3471, "time_per_iteration": 2.42753005027771 }, { "auxiliary_loss_clip": 0.01096855, "auxiliary_loss_mlp": 0.01040165, "balance_loss_clip": 1.0192678, "balance_loss_mlp": 1.02707171, "epoch": 0.20874793326318955, "flos": 27527897341440.0, "grad_norm": 1.698707599750462, "language_loss": 0.75870049, "learning_rate": 3.585459996937798e-06, "loss": 0.78007072, "num_input_tokens_seen": 75080150, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.6953125, "step": 3472, "time_per_iteration": 2.454162836074829 }, { "auxiliary_loss_clip": 0.01093067, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.01401162, "balance_loss_mlp": 1.02869606, "epoch": 0.20880805651585752, "flos": 18547887340800.0, "grad_norm": 2.029897599746445, "language_loss": 0.84425724, "learning_rate": 3.585229658387761e-06, "loss": 0.86550981, "num_input_tokens_seen": 75097920, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.640625, "step": 3473, "time_per_iteration": 2.374516487121582 }, { "auxiliary_loss_clip": 0.01022572, "auxiliary_loss_mlp": 0.01001798, "balance_loss_clip": 0.99985516, "balance_loss_mlp": 1.00716949, "epoch": 0.20886817976852548, "flos": 65943318792960.0, "grad_norm": 0.891368913184291, "language_loss": 0.63656253, "learning_rate": 3.5849992632649552e-06, "loss": 0.65680623, "num_input_tokens_seen": 75152410, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.15429688, "step": 3474, "time_per_iteration": 2.9225313663482666 }, { "auxiliary_loss_clip": 0.01096738, "auxiliary_loss_mlp": 0.01040183, "balance_loss_clip": 1.02053738, "balance_loss_mlp": 1.03014934, "epoch": 0.20892830302119345, "flos": 36537688598400.0, "grad_norm": 2.0460801325355913, "language_loss": 0.69748187, "learning_rate": 3.5847688115776024e-06, "loss": 0.71885109, "num_input_tokens_seen": 75173265, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6640625, "step": 3475, "time_per_iteration": 2.5197010040283203 }, { "auxiliary_loss_clip": 0.01094102, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.0175643, "balance_loss_mlp": 1.02916694, "epoch": 0.2089884262738614, "flos": 20955777189120.0, "grad_norm": 1.5004689408877798, "language_loss": 0.70123768, "learning_rate": 3.5845383033339274e-06, "loss": 0.72254401, "num_input_tokens_seen": 75193640, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6484375, "step": 3476, "time_per_iteration": 2.383960723876953 }, { "auxiliary_loss_clip": 0.01092594, "auxiliary_loss_mlp": 0.01034302, "balance_loss_clip": 1.01690972, "balance_loss_mlp": 1.02826881, "epoch": 0.20904854952652938, "flos": 22782921615360.0, "grad_norm": 2.0210394947761574, "language_loss": 0.89149666, "learning_rate": 3.584307738542156e-06, "loss": 0.91276556, "num_input_tokens_seen": 75212545, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.640625, "step": 3477, "time_per_iteration": 2.394998550415039 }, { "auxiliary_loss_clip": 0.01092902, "auxiliary_loss_mlp": 0.01034753, "balance_loss_clip": 1.01601338, "balance_loss_mlp": 1.02812278, "epoch": 0.20910867277919734, "flos": 27302184201600.0, "grad_norm": 2.0467141777077686, "language_loss": 0.67719901, "learning_rate": 3.5840771172105174e-06, "loss": 0.6984756, "num_input_tokens_seen": 75230865, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6484375, "step": 3478, "time_per_iteration": 2.425455331802368 }, { "auxiliary_loss_clip": 0.01093302, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.02109003, "balance_loss_mlp": 1.02950048, "epoch": 0.20916879603186533, "flos": 14318369061120.0, "grad_norm": 2.151720096034742, "language_loss": 0.84817231, "learning_rate": 3.5838464393472406e-06, "loss": 0.86950397, "num_input_tokens_seen": 75248285, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.63671875, "step": 3479, "time_per_iteration": 2.3746531009674072 }, { "auxiliary_loss_clip": 0.01094081, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 1.01621294, "balance_loss_mlp": 1.0284301, "epoch": 0.2092289192845333, "flos": 22271932823040.0, "grad_norm": 2.8047810504700625, "language_loss": 0.73754495, "learning_rate": 3.5836157049605587e-06, "loss": 0.75883412, "num_input_tokens_seen": 75266310, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.65625, "step": 3480, "time_per_iteration": 2.388402223587036 }, { "auxiliary_loss_clip": 0.01091454, "auxiliary_loss_mlp": 0.01035426, "balance_loss_clip": 1.01849866, "balance_loss_mlp": 1.02858162, "epoch": 0.20928904253720126, "flos": 14829811701120.0, "grad_norm": 2.1618652065751394, "language_loss": 0.75717729, "learning_rate": 3.5833849140587057e-06, "loss": 0.77844608, "num_input_tokens_seen": 75284175, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.62890625, "step": 3481, "time_per_iteration": 2.3657357692718506 }, { "auxiliary_loss_clip": 0.01093397, "auxiliary_loss_mlp": 0.01040802, "balance_loss_clip": 1.02238441, "balance_loss_mlp": 1.02931952, "epoch": 0.20934916578986923, "flos": 23258019663360.0, "grad_norm": 2.4154735349596668, "language_loss": 0.85249126, "learning_rate": 3.583154066649918e-06, "loss": 0.8738333, "num_input_tokens_seen": 75303465, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.640625, "step": 3482, "time_per_iteration": 2.4067273139953613 }, { "auxiliary_loss_clip": 0.01094505, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.016289, "balance_loss_mlp": 1.02950621, "epoch": 0.2094092890425372, "flos": 32013049662720.0, "grad_norm": 5.260844561923305, "language_loss": 0.71030521, "learning_rate": 3.5829231627424345e-06, "loss": 0.73160881, "num_input_tokens_seen": 75325290, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6484375, "step": 3483, "time_per_iteration": 2.4645891189575195 }, { "auxiliary_loss_clip": 0.01093589, "auxiliary_loss_mlp": 0.01041059, "balance_loss_clip": 1.02218843, "balance_loss_mlp": 1.02667093, "epoch": 0.20946941229520516, "flos": 20009630810880.0, "grad_norm": 1.5227581604886158, "language_loss": 0.75268054, "learning_rate": 3.5826922023444945e-06, "loss": 0.77402705, "num_input_tokens_seen": 75343895, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.66796875, "step": 3484, "time_per_iteration": 2.4082491397857666 }, { "auxiliary_loss_clip": 0.01093195, "auxiliary_loss_mlp": 0.01035365, "balance_loss_clip": 1.01635146, "balance_loss_mlp": 1.02883124, "epoch": 0.20952953554787315, "flos": 30738684792960.0, "grad_norm": 1.5972834679782852, "language_loss": 0.70754176, "learning_rate": 3.582461185464342e-06, "loss": 0.72882736, "num_input_tokens_seen": 75367100, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.64453125, "step": 3485, "time_per_iteration": 2.4705119132995605 }, { "auxiliary_loss_clip": 0.01095458, "auxiliary_loss_mlp": 0.01036053, "balance_loss_clip": 1.01784933, "balance_loss_mlp": 1.03004837, "epoch": 0.20958965880054112, "flos": 27048086259840.0, "grad_norm": 2.164433217911864, "language_loss": 0.83064806, "learning_rate": 3.5822301121102195e-06, "loss": 0.8519631, "num_input_tokens_seen": 75389925, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.65625, "step": 3486, "time_per_iteration": 2.4376449584960938 }, { "auxiliary_loss_clip": 0.01094113, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.02024603, "balance_loss_mlp": 1.02828133, "epoch": 0.20964978205320908, "flos": 34202697402240.0, "grad_norm": 1.6871712925684774, "language_loss": 0.8739146, "learning_rate": 3.5819989822903744e-06, "loss": 0.89524567, "num_input_tokens_seen": 75408575, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65625, "step": 3487, "time_per_iteration": 2.474391460418701 }, { "auxiliary_loss_clip": 0.01092124, "auxiliary_loss_mlp": 0.01039022, "balance_loss_clip": 1.02004361, "balance_loss_mlp": 1.02856433, "epoch": 0.20970990530587705, "flos": 23476261772160.0, "grad_norm": 2.695113070981032, "language_loss": 0.72291046, "learning_rate": 3.5817677960130547e-06, "loss": 0.74422193, "num_input_tokens_seen": 75427155, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.63671875, "step": 3488, "time_per_iteration": 2.3911118507385254 }, { "auxiliary_loss_clip": 0.01092648, "auxiliary_loss_mlp": 0.01034636, "balance_loss_clip": 1.0165875, "balance_loss_mlp": 1.02789044, "epoch": 0.209770028558545, "flos": 18550470781440.0, "grad_norm": 2.7849653661817775, "language_loss": 0.81004465, "learning_rate": 3.5815365532865113e-06, "loss": 0.83131742, "num_input_tokens_seen": 75444450, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6484375, "step": 3489, "time_per_iteration": 2.354342460632324 }, { "auxiliary_loss_clip": 0.01091426, "auxiliary_loss_mlp": 0.01036575, "balance_loss_clip": 1.0183357, "balance_loss_mlp": 1.02779531, "epoch": 0.20983015181121298, "flos": 21615914776320.0, "grad_norm": 1.74949974030656, "language_loss": 0.73232079, "learning_rate": 3.5813052541189972e-06, "loss": 0.75360084, "num_input_tokens_seen": 75462625, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.63671875, "step": 3490, "time_per_iteration": 3.758084535598755 }, { "auxiliary_loss_clip": 0.01088927, "auxiliary_loss_mlp": 0.01039904, "balance_loss_clip": 1.02260721, "balance_loss_mlp": 1.02791572, "epoch": 0.20989027506388094, "flos": 16613873642880.0, "grad_norm": 1.7351063103210318, "language_loss": 0.70122451, "learning_rate": 3.581073898518766e-06, "loss": 0.72251278, "num_input_tokens_seen": 75480640, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.609375, "step": 3491, "time_per_iteration": 2.357231378555298 }, { "auxiliary_loss_clip": 0.01091895, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.02015138, "balance_loss_mlp": 1.0268507, "epoch": 0.20995039831654894, "flos": 23215844874240.0, "grad_norm": 2.4201866668159187, "language_loss": 0.7964893, "learning_rate": 3.5808424864940737e-06, "loss": 0.81780875, "num_input_tokens_seen": 75494900, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.65234375, "step": 3492, "time_per_iteration": 2.3604140281677246 }, { "auxiliary_loss_clip": 0.01092678, "auxiliary_loss_mlp": 0.01037695, "balance_loss_clip": 1.01931262, "balance_loss_mlp": 1.02888894, "epoch": 0.2100105215692169, "flos": 18146595640320.0, "grad_norm": 2.5281521179956346, "language_loss": 0.86871386, "learning_rate": 3.5806110180531797e-06, "loss": 0.89001751, "num_input_tokens_seen": 75513370, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.63671875, "step": 3493, "time_per_iteration": 3.7610621452331543 }, { "auxiliary_loss_clip": 0.0108737, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.01510119, "balance_loss_mlp": 1.02624381, "epoch": 0.21007064482188487, "flos": 15960683416320.0, "grad_norm": 1.8347118331738057, "language_loss": 0.69150156, "learning_rate": 3.5803794932043447e-06, "loss": 0.71270388, "num_input_tokens_seen": 75532480, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.609375, "step": 3494, "time_per_iteration": 3.748657703399658 }, { "auxiliary_loss_clip": 0.01096434, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.0126884, "balance_loss_mlp": 1.02939677, "epoch": 0.21013076807455283, "flos": 32232932605440.0, "grad_norm": 1.7145772598145161, "language_loss": 0.78903693, "learning_rate": 3.58014791195583e-06, "loss": 0.81030554, "num_input_tokens_seen": 75552745, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.671875, "step": 3495, "time_per_iteration": 2.453927993774414 }, { "auxiliary_loss_clip": 0.01090338, "auxiliary_loss_mlp": 0.01032527, "balance_loss_clip": 1.01542044, "balance_loss_mlp": 1.02628589, "epoch": 0.2101908913272208, "flos": 23695481399040.0, "grad_norm": 2.2656349365308595, "language_loss": 0.77215421, "learning_rate": 3.579916274315902e-06, "loss": 0.79338288, "num_input_tokens_seen": 75574355, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.640625, "step": 3496, "time_per_iteration": 3.774770975112915 }, { "auxiliary_loss_clip": 0.01092697, "auxiliary_loss_mlp": 0.01046259, "balance_loss_clip": 1.02656555, "balance_loss_mlp": 1.02746975, "epoch": 0.21025101457988876, "flos": 20374752476160.0, "grad_norm": 2.149596692604933, "language_loss": 0.82383239, "learning_rate": 3.5796845802928254e-06, "loss": 0.84522194, "num_input_tokens_seen": 75592215, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.65234375, "step": 3497, "time_per_iteration": 2.397838830947876 }, { "auxiliary_loss_clip": 0.01093279, "auxiliary_loss_mlp": 0.01036384, "balance_loss_clip": 1.01709604, "balance_loss_mlp": 1.02829242, "epoch": 0.21031113783255675, "flos": 25774454528640.0, "grad_norm": 1.9210431419349947, "language_loss": 0.67437673, "learning_rate": 3.5794528298948696e-06, "loss": 0.69567335, "num_input_tokens_seen": 75610740, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6484375, "step": 3498, "time_per_iteration": 2.430166721343994 }, { "auxiliary_loss_clip": 0.01092432, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.01723123, "balance_loss_mlp": 1.0269568, "epoch": 0.21037126108522472, "flos": 22017101742720.0, "grad_norm": 2.40011080587237, "language_loss": 0.80467963, "learning_rate": 3.579221023130306e-06, "loss": 0.82596362, "num_input_tokens_seen": 75631005, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65234375, "step": 3499, "time_per_iteration": 2.386989116668701 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01036061, "balance_loss_clip": 1.01844192, "balance_loss_mlp": 1.02774572, "epoch": 0.21043138433789269, "flos": 25333327100160.0, "grad_norm": 2.0378850646674143, "language_loss": 0.78460246, "learning_rate": 3.578989160007405e-06, "loss": 0.80587709, "num_input_tokens_seen": 75650655, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.63671875, "step": 3500, "time_per_iteration": 2.434589385986328 }, { "auxiliary_loss_clip": 0.01091267, "auxiliary_loss_mlp": 0.01037002, "balance_loss_clip": 1.01852453, "balance_loss_mlp": 1.02694094, "epoch": 0.21049150759056065, "flos": 25555479281280.0, "grad_norm": 3.590639146678211, "language_loss": 0.73696578, "learning_rate": 3.5787572405344437e-06, "loss": 0.75824845, "num_input_tokens_seen": 75669895, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.640625, "step": 3501, "time_per_iteration": 2.405604124069214 }, { "auxiliary_loss_clip": 0.01088548, "auxiliary_loss_mlp": 0.01038961, "balance_loss_clip": 1.02068686, "balance_loss_mlp": 1.02602112, "epoch": 0.21055163084322862, "flos": 24494538752640.0, "grad_norm": 1.4629088850255327, "language_loss": 0.75636578, "learning_rate": 3.578525264719697e-06, "loss": 0.77764094, "num_input_tokens_seen": 75689535, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.625, "step": 3502, "time_per_iteration": 2.4369428157806396 }, { "auxiliary_loss_clip": 0.01091219, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.01767862, "balance_loss_mlp": 1.02838397, "epoch": 0.21061175409589658, "flos": 25737865557120.0, "grad_norm": 1.859324534822765, "language_loss": 0.77522284, "learning_rate": 3.578293232571444e-06, "loss": 0.79648918, "num_input_tokens_seen": 75709265, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.625, "step": 3503, "time_per_iteration": 2.4071285724639893 }, { "auxiliary_loss_clip": 0.01095094, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.02020359, "balance_loss_mlp": 1.02660537, "epoch": 0.21067187734856455, "flos": 18988176896640.0, "grad_norm": 2.2934284972015995, "language_loss": 0.7852217, "learning_rate": 3.5780611440979655e-06, "loss": 0.80659562, "num_input_tokens_seen": 75727050, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 0.68359375, "step": 3504, "time_per_iteration": 2.367966413497925 }, { "auxiliary_loss_clip": 0.01096176, "auxiliary_loss_mlp": 0.010381, "balance_loss_clip": 1.01759577, "balance_loss_mlp": 1.02878189, "epoch": 0.21073200060123254, "flos": 24680206696320.0, "grad_norm": 1.8603292502709545, "language_loss": 0.7668277, "learning_rate": 3.5778289993075442e-06, "loss": 0.78817046, "num_input_tokens_seen": 75747175, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.671875, "step": 3505, "time_per_iteration": 2.4021263122558594 }, { "auxiliary_loss_clip": 0.01089215, "auxiliary_loss_mlp": 0.0104564, "balance_loss_clip": 1.02707958, "balance_loss_mlp": 1.02682805, "epoch": 0.2107921238539005, "flos": 28548059535360.0, "grad_norm": 1.901215270424609, "language_loss": 0.63815582, "learning_rate": 3.5775967982084644e-06, "loss": 0.65950441, "num_input_tokens_seen": 75767690, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.625, "step": 3506, "time_per_iteration": 2.438072443008423 }, { "auxiliary_loss_clip": 0.01091591, "auxiliary_loss_mlp": 0.01036945, "balance_loss_clip": 1.01754928, "balance_loss_mlp": 1.02711058, "epoch": 0.21085224710656847, "flos": 25884640379520.0, "grad_norm": 1.602578378639749, "language_loss": 0.82088757, "learning_rate": 3.5773645408090126e-06, "loss": 0.84217298, "num_input_tokens_seen": 75787255, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.640625, "step": 3507, "time_per_iteration": 2.4251205921173096 }, { "auxiliary_loss_clip": 0.01092175, "auxiliary_loss_mlp": 0.01033608, "balance_loss_clip": 1.01497543, "balance_loss_mlp": 1.02826834, "epoch": 0.21091237035923643, "flos": 14975399537280.0, "grad_norm": 1.8195482504015763, "language_loss": 0.75703776, "learning_rate": 3.577132227117478e-06, "loss": 0.77829552, "num_input_tokens_seen": 75805890, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.640625, "step": 3508, "time_per_iteration": 2.3597803115844727 }, { "auxiliary_loss_clip": 0.01095042, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.01726651, "balance_loss_mlp": 1.02855814, "epoch": 0.2109724936119044, "flos": 16361591091840.0, "grad_norm": 2.685959813157059, "language_loss": 0.85451281, "learning_rate": 3.576899857142152e-06, "loss": 0.87583244, "num_input_tokens_seen": 75821620, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.6640625, "step": 3509, "time_per_iteration": 2.334444999694824 }, { "auxiliary_loss_clip": 0.01095768, "auxiliary_loss_mlp": 0.0103721, "balance_loss_clip": 1.01792216, "balance_loss_mlp": 1.02884912, "epoch": 0.21103261686457236, "flos": 31501188086400.0, "grad_norm": 1.882526021084428, "language_loss": 0.68351012, "learning_rate": 3.5766674308913254e-06, "loss": 0.70483989, "num_input_tokens_seen": 75842490, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.671875, "step": 3510, "time_per_iteration": 2.4673638343811035 }, { "auxiliary_loss_clip": 0.01091588, "auxiliary_loss_mlp": 0.01032657, "balance_loss_clip": 1.01431036, "balance_loss_mlp": 1.02609229, "epoch": 0.21109274011724033, "flos": 27342857802240.0, "grad_norm": 1.6090744940540962, "language_loss": 0.71818495, "learning_rate": 3.5764349483732937e-06, "loss": 0.73942745, "num_input_tokens_seen": 75865985, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.65625, "step": 3511, "time_per_iteration": 2.44651460647583 }, { "auxiliary_loss_clip": 0.01095289, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.01693511, "balance_loss_mlp": 1.02731109, "epoch": 0.21115286336990832, "flos": 17819459400960.0, "grad_norm": 3.0959236714669123, "language_loss": 0.69297749, "learning_rate": 3.5762024095963543e-06, "loss": 0.71430719, "num_input_tokens_seen": 75882745, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.6796875, "step": 3512, "time_per_iteration": 2.3455917835235596 }, { "auxiliary_loss_clip": 0.01091973, "auxiliary_loss_mlp": 0.01039771, "balance_loss_clip": 1.01988673, "balance_loss_mlp": 1.02665246, "epoch": 0.2112129866225763, "flos": 27196781207040.0, "grad_norm": 1.918463867823304, "language_loss": 0.73346496, "learning_rate": 3.575969814568805e-06, "loss": 0.75478244, "num_input_tokens_seen": 75904305, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.65234375, "step": 3513, "time_per_iteration": 2.4369938373565674 }, { "auxiliary_loss_clip": 0.01089965, "auxiliary_loss_mlp": 0.01029703, "balance_loss_clip": 1.0124774, "balance_loss_mlp": 1.02838898, "epoch": 0.21127310987524425, "flos": 23730185157120.0, "grad_norm": 1.6947620457212758, "language_loss": 0.74049127, "learning_rate": 3.5757371632989477e-06, "loss": 0.76168793, "num_input_tokens_seen": 75923710, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.6171875, "step": 3514, "time_per_iteration": 2.3905675411224365 }, { "auxiliary_loss_clip": 0.01093489, "auxiliary_loss_mlp": 0.01036614, "balance_loss_clip": 1.01751673, "balance_loss_mlp": 1.0279429, "epoch": 0.21133323312791222, "flos": 18331530445440.0, "grad_norm": 2.141505591336286, "language_loss": 0.76685333, "learning_rate": 3.5755044557950832e-06, "loss": 0.7881543, "num_input_tokens_seen": 75942625, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.65625, "step": 3515, "time_per_iteration": 2.3575501441955566 }, { "auxiliary_loss_clip": 0.0109182, "auxiliary_loss_mlp": 0.01037405, "balance_loss_clip": 1.01941621, "balance_loss_mlp": 1.02857447, "epoch": 0.21139335638058018, "flos": 17930238744960.0, "grad_norm": 1.8311208917963162, "language_loss": 0.68553162, "learning_rate": 3.575271692065518e-06, "loss": 0.70682395, "num_input_tokens_seen": 75959930, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6328125, "step": 3516, "time_per_iteration": 2.3643317222595215 }, { "auxiliary_loss_clip": 0.01095717, "auxiliary_loss_mlp": 0.01041849, "balance_loss_clip": 1.02270365, "balance_loss_mlp": 1.02930379, "epoch": 0.21145347963324815, "flos": 24570928540800.0, "grad_norm": 1.742982541473488, "language_loss": 0.85129178, "learning_rate": 3.575038872118558e-06, "loss": 0.87266737, "num_input_tokens_seen": 75980335, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6640625, "step": 3517, "time_per_iteration": 2.4110751152038574 }, { "auxiliary_loss_clip": 0.01090777, "auxiliary_loss_mlp": 0.0103677, "balance_loss_clip": 1.01811337, "balance_loss_mlp": 1.02686214, "epoch": 0.21151360288591614, "flos": 35844488087040.0, "grad_norm": 1.8859616750495563, "language_loss": 0.62656885, "learning_rate": 3.5748059959625122e-06, "loss": 0.64784431, "num_input_tokens_seen": 76002095, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.640625, "step": 3518, "time_per_iteration": 2.496013641357422 }, { "auxiliary_loss_clip": 0.01091664, "auxiliary_loss_mlp": 0.01043294, "balance_loss_clip": 1.02456641, "balance_loss_mlp": 1.02831721, "epoch": 0.2115737261385841, "flos": 24640510613760.0, "grad_norm": 1.8366046838889278, "language_loss": 0.88661051, "learning_rate": 3.574573063605691e-06, "loss": 0.90796006, "num_input_tokens_seen": 76020425, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6328125, "step": 3519, "time_per_iteration": 2.4008982181549072 }, { "auxiliary_loss_clip": 0.01095851, "auxiliary_loss_mlp": 0.01036825, "balance_loss_clip": 1.01750147, "balance_loss_mlp": 1.02927542, "epoch": 0.21163384939125207, "flos": 25225759601280.0, "grad_norm": 1.6634088581540472, "language_loss": 0.81199706, "learning_rate": 3.574340075056408e-06, "loss": 0.83332372, "num_input_tokens_seen": 76041210, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6640625, "step": 3520, "time_per_iteration": 2.4060888290405273 }, { "auxiliary_loss_clip": 0.01088627, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.02148867, "balance_loss_mlp": 1.02626753, "epoch": 0.21169397264392004, "flos": 26066328428160.0, "grad_norm": 1.655151660929098, "language_loss": 0.75676954, "learning_rate": 3.5741070303229776e-06, "loss": 0.77804863, "num_input_tokens_seen": 76062685, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.625, "step": 3521, "time_per_iteration": 2.4211723804473877 }, { "auxiliary_loss_clip": 0.01094089, "auxiliary_loss_mlp": 0.01037113, "balance_loss_clip": 1.02080548, "balance_loss_mlp": 1.02887058, "epoch": 0.211754095896588, "flos": 23107264945920.0, "grad_norm": 3.0514413758967063, "language_loss": 0.75513554, "learning_rate": 3.5738739294137154e-06, "loss": 0.77644765, "num_input_tokens_seen": 76082300, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.65234375, "step": 3522, "time_per_iteration": 2.3824939727783203 }, { "auxiliary_loss_clip": 0.01090576, "auxiliary_loss_mlp": 0.01048629, "balance_loss_clip": 1.02953196, "balance_loss_mlp": 1.02632892, "epoch": 0.21181421914925597, "flos": 27921264163200.0, "grad_norm": 1.8193334486733563, "language_loss": 0.69856447, "learning_rate": 3.573640772336942e-06, "loss": 0.71995652, "num_input_tokens_seen": 76101135, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.64453125, "step": 3523, "time_per_iteration": 2.4629838466644287 }, { "auxiliary_loss_clip": 0.0109289, "auxiliary_loss_mlp": 0.01044684, "balance_loss_clip": 1.0258131, "balance_loss_mlp": 1.02810442, "epoch": 0.21187434240192393, "flos": 17127690255360.0, "grad_norm": 3.4908955838004045, "language_loss": 0.77000618, "learning_rate": 3.573407559100977e-06, "loss": 0.79138196, "num_input_tokens_seen": 76119320, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6484375, "step": 3524, "time_per_iteration": 2.3608884811401367 }, { "auxiliary_loss_clip": 0.01089836, "auxiliary_loss_mlp": 0.01037284, "balance_loss_clip": 1.01856828, "balance_loss_mlp": 1.02484906, "epoch": 0.21193446565459192, "flos": 22346193018240.0, "grad_norm": 1.9693963647455388, "language_loss": 0.81461942, "learning_rate": 3.573174289714143e-06, "loss": 0.83589065, "num_input_tokens_seen": 76137445, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6484375, "step": 3525, "time_per_iteration": 2.3843276500701904 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.0103408, "balance_loss_clip": 1.01537633, "balance_loss_mlp": 1.02833056, "epoch": 0.2119945889072599, "flos": 27198072927360.0, "grad_norm": 1.6840448047188865, "language_loss": 0.74895763, "learning_rate": 3.572940964184766e-06, "loss": 0.77022207, "num_input_tokens_seen": 76159500, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.640625, "step": 3526, "time_per_iteration": 2.4151835441589355 }, { "auxiliary_loss_clip": 0.01092074, "auxiliary_loss_mlp": 0.01032019, "balance_loss_clip": 1.01324344, "balance_loss_mlp": 1.02757716, "epoch": 0.21205471215992786, "flos": 20990934794880.0, "grad_norm": 1.6340175886380324, "language_loss": 0.77082324, "learning_rate": 3.572707582521172e-06, "loss": 0.79206413, "num_input_tokens_seen": 76177990, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.64453125, "step": 3527, "time_per_iteration": 2.389474630355835 }, { "auxiliary_loss_clip": 0.01090427, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.02573991, "balance_loss_mlp": 1.02565289, "epoch": 0.21211483541259582, "flos": 20776602758400.0, "grad_norm": 1.8858213112158622, "language_loss": 0.7840848, "learning_rate": 3.5724741447316894e-06, "loss": 0.8054502, "num_input_tokens_seen": 76197125, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6484375, "step": 3528, "time_per_iteration": 2.3695602416992188 }, { "auxiliary_loss_clip": 0.01091783, "auxiliary_loss_mlp": 0.01037309, "balance_loss_clip": 1.01918912, "balance_loss_mlp": 1.02790213, "epoch": 0.21217495866526379, "flos": 18988979857920.0, "grad_norm": 1.857439679164292, "language_loss": 0.8140527, "learning_rate": 3.57224065082465e-06, "loss": 0.8353436, "num_input_tokens_seen": 76216215, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.640625, "step": 3529, "time_per_iteration": 2.3696351051330566 }, { "auxiliary_loss_clip": 0.01094116, "auxiliary_loss_mlp": 0.0104552, "balance_loss_clip": 1.02585042, "balance_loss_mlp": 1.02790391, "epoch": 0.21223508191793175, "flos": 20666277262080.0, "grad_norm": 2.0687359976770687, "language_loss": 0.7687794, "learning_rate": 3.572007100808386e-06, "loss": 0.7901758, "num_input_tokens_seen": 76237010, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.66015625, "step": 3530, "time_per_iteration": 3.769836187362671 }, { "auxiliary_loss_clip": 0.01089458, "auxiliary_loss_mlp": 0.01036236, "balance_loss_clip": 1.01879537, "balance_loss_mlp": 1.02676487, "epoch": 0.21229520517059972, "flos": 21615391105920.0, "grad_norm": 2.4408438047408825, "language_loss": 0.83416092, "learning_rate": 3.5717734946912323e-06, "loss": 0.85541785, "num_input_tokens_seen": 76255965, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.625, "step": 3531, "time_per_iteration": 2.370994806289673 }, { "auxiliary_loss_clip": 0.01096942, "auxiliary_loss_mlp": 0.01036436, "balance_loss_clip": 1.01590848, "balance_loss_mlp": 1.03025162, "epoch": 0.2123553284232677, "flos": 13990185480960.0, "grad_norm": 2.124867427867221, "language_loss": 0.73241019, "learning_rate": 3.5715398324815248e-06, "loss": 0.75374401, "num_input_tokens_seen": 76272150, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.66796875, "step": 3532, "time_per_iteration": 2.333064556121826 }, { "auxiliary_loss_clip": 0.01091105, "auxiliary_loss_mlp": 0.01040545, "balance_loss_clip": 1.02134037, "balance_loss_mlp": 1.02644968, "epoch": 0.21241545167593567, "flos": 18295779346560.0, "grad_norm": 1.5600115485066246, "language_loss": 0.73706496, "learning_rate": 3.5713061141876038e-06, "loss": 0.75838149, "num_input_tokens_seen": 76291425, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6484375, "step": 3533, "time_per_iteration": 3.7267649173736572 }, { "auxiliary_loss_clip": 0.01092779, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.0176512, "balance_loss_mlp": 1.0267204, "epoch": 0.21247557492860364, "flos": 34711731158400.0, "grad_norm": 1.816658117639494, "language_loss": 0.71616459, "learning_rate": 3.57107233981781e-06, "loss": 0.73745942, "num_input_tokens_seen": 76313975, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.66015625, "step": 3534, "time_per_iteration": 3.8768603801727295 }, { "auxiliary_loss_clip": 0.01092431, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.01673508, "balance_loss_mlp": 1.02860451, "epoch": 0.2125356981812716, "flos": 22052748107520.0, "grad_norm": 1.7443495829552542, "language_loss": 0.71470159, "learning_rate": 3.570838509380485e-06, "loss": 0.7359848, "num_input_tokens_seen": 76330955, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.63671875, "step": 3535, "time_per_iteration": 2.3606069087982178 }, { "auxiliary_loss_clip": 0.01088114, "auxiliary_loss_mlp": 0.0104386, "balance_loss_clip": 1.02588296, "balance_loss_mlp": 1.02686036, "epoch": 0.21259582143393957, "flos": 28547082017280.0, "grad_norm": 2.676692349575711, "language_loss": 0.70630693, "learning_rate": 3.5706046228839744e-06, "loss": 0.72762668, "num_input_tokens_seen": 76352680, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.61328125, "step": 3536, "time_per_iteration": 3.8124794960021973 }, { "auxiliary_loss_clip": 0.01093179, "auxiliary_loss_mlp": 0.01040017, "balance_loss_clip": 1.0213486, "balance_loss_mlp": 1.02764964, "epoch": 0.21265594468660753, "flos": 20119851573120.0, "grad_norm": 1.8183940484573426, "language_loss": 0.88206851, "learning_rate": 3.5703706803366245e-06, "loss": 0.90340042, "num_input_tokens_seen": 76370750, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65625, "step": 3537, "time_per_iteration": 2.3919172286987305 }, { "auxiliary_loss_clip": 0.01087442, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.01663041, "balance_loss_mlp": 1.02580369, "epoch": 0.21271606793927553, "flos": 23075039894400.0, "grad_norm": 1.7987711909369988, "language_loss": 0.80270451, "learning_rate": 3.5701366817467852e-06, "loss": 0.8239212, "num_input_tokens_seen": 76390610, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6171875, "step": 3538, "time_per_iteration": 2.3892593383789062 }, { "auxiliary_loss_clip": 0.01089552, "auxiliary_loss_mlp": 0.01033069, "balance_loss_clip": 1.01600981, "balance_loss_mlp": 1.02698207, "epoch": 0.2127761911919435, "flos": 26387215534080.0, "grad_norm": 1.5432817295161552, "language_loss": 0.87025869, "learning_rate": 3.569902627122807e-06, "loss": 0.89148492, "num_input_tokens_seen": 76408860, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.625, "step": 3539, "time_per_iteration": 2.413520097732544 }, { "auxiliary_loss_clip": 0.01092517, "auxiliary_loss_mlp": 0.01035804, "balance_loss_clip": 1.01732671, "balance_loss_mlp": 1.02834845, "epoch": 0.21283631444461146, "flos": 20227279426560.0, "grad_norm": 1.9640439737483033, "language_loss": 0.58144236, "learning_rate": 3.5696685164730413e-06, "loss": 0.60272551, "num_input_tokens_seen": 76424980, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.640625, "step": 3540, "time_per_iteration": 2.396209478378296 }, { "auxiliary_loss_clip": 0.0109066, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.01773214, "balance_loss_mlp": 1.02585626, "epoch": 0.21289643769727942, "flos": 13516134773760.0, "grad_norm": 2.8270848855133885, "language_loss": 0.76063997, "learning_rate": 3.569434349805844e-06, "loss": 0.7819224, "num_input_tokens_seen": 76443135, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.6484375, "step": 3541, "time_per_iteration": 2.3696482181549072 }, { "auxiliary_loss_clip": 0.01088552, "auxiliary_loss_mlp": 0.01033831, "balance_loss_clip": 1.01646209, "balance_loss_mlp": 1.02636743, "epoch": 0.2129565609499474, "flos": 24825864355200.0, "grad_norm": 1.8189748043941079, "language_loss": 0.69292998, "learning_rate": 3.569200127129572e-06, "loss": 0.71415377, "num_input_tokens_seen": 76462470, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.62109375, "step": 3542, "time_per_iteration": 2.408999443054199 }, { "auxiliary_loss_clip": 0.01088057, "auxiliary_loss_mlp": 0.01037721, "balance_loss_clip": 1.02069831, "balance_loss_mlp": 1.02653039, "epoch": 0.21301668420261535, "flos": 23658124377600.0, "grad_norm": 1.9558757392083184, "language_loss": 0.76540411, "learning_rate": 3.568965848452584e-06, "loss": 0.78666192, "num_input_tokens_seen": 76481995, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.61328125, "step": 3543, "time_per_iteration": 2.38474178314209 }, { "auxiliary_loss_clip": 0.01090817, "auxiliary_loss_mlp": 0.01036364, "balance_loss_clip": 1.01883984, "balance_loss_mlp": 1.02940273, "epoch": 0.21307680745528332, "flos": 16361870382720.0, "grad_norm": 1.787852595170984, "language_loss": 0.66598499, "learning_rate": 3.568731513783241e-06, "loss": 0.68725681, "num_input_tokens_seen": 76500245, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.6171875, "step": 3544, "time_per_iteration": 2.3578274250030518 }, { "auxiliary_loss_clip": 0.01090795, "auxiliary_loss_mlp": 0.01035123, "balance_loss_clip": 1.01752782, "balance_loss_mlp": 1.02702022, "epoch": 0.2131369307079513, "flos": 19098048545280.0, "grad_norm": 1.7353996480229472, "language_loss": 0.71131104, "learning_rate": 3.568497123129905e-06, "loss": 0.73257023, "num_input_tokens_seen": 76519535, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.63671875, "step": 3545, "time_per_iteration": 2.364079713821411 }, { "auxiliary_loss_clip": 0.01092134, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.01905966, "balance_loss_mlp": 1.02641511, "epoch": 0.21319705396061928, "flos": 30370979687040.0, "grad_norm": 3.304056499257508, "language_loss": 0.72142816, "learning_rate": 3.568262676500942e-06, "loss": 0.74273175, "num_input_tokens_seen": 76542065, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.65625, "step": 3546, "time_per_iteration": 2.44181489944458 }, { "auxiliary_loss_clip": 0.01090506, "auxiliary_loss_mlp": 0.01039076, "balance_loss_clip": 1.02145648, "balance_loss_mlp": 1.02731371, "epoch": 0.21325717721328724, "flos": 21755288390400.0, "grad_norm": 2.2793165837832254, "language_loss": 0.80409074, "learning_rate": 3.568028173904717e-06, "loss": 0.82538652, "num_input_tokens_seen": 76560540, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6328125, "step": 3547, "time_per_iteration": 2.367537260055542 }, { "auxiliary_loss_clip": 0.01092303, "auxiliary_loss_mlp": 0.01040354, "balance_loss_clip": 1.02100635, "balance_loss_mlp": 1.02711093, "epoch": 0.2133173004659552, "flos": 28729607938560.0, "grad_norm": 2.235681769906824, "language_loss": 0.74547142, "learning_rate": 3.567793615349601e-06, "loss": 0.76679802, "num_input_tokens_seen": 76581760, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.65234375, "step": 3548, "time_per_iteration": 2.429685354232788 }, { "auxiliary_loss_clip": 0.0109612, "auxiliary_loss_mlp": 0.01039787, "balance_loss_clip": 1.019665, "balance_loss_mlp": 1.02880847, "epoch": 0.21337742371862317, "flos": 16836130558080.0, "grad_norm": 1.8959408548935859, "language_loss": 0.74469119, "learning_rate": 3.567559000843963e-06, "loss": 0.76605028, "num_input_tokens_seen": 76599940, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.671875, "step": 3549, "time_per_iteration": 2.352445363998413 }, { "auxiliary_loss_clip": 0.0109448, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.01723397, "balance_loss_mlp": 1.02977777, "epoch": 0.21343754697129114, "flos": 24423804604800.0, "grad_norm": 1.6763507306574892, "language_loss": 0.8074863, "learning_rate": 3.567324330396177e-06, "loss": 0.82877523, "num_input_tokens_seen": 76619580, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6484375, "step": 3550, "time_per_iteration": 2.4111640453338623 }, { "auxiliary_loss_clip": 0.01090743, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.01688302, "balance_loss_mlp": 1.02898681, "epoch": 0.21349767022395913, "flos": 19276908773760.0, "grad_norm": 1.6263183111974162, "language_loss": 0.87823749, "learning_rate": 3.5670896040146173e-06, "loss": 0.89948064, "num_input_tokens_seen": 76638195, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.6171875, "step": 3551, "time_per_iteration": 2.378389358520508 }, { "auxiliary_loss_clip": 0.01091534, "auxiliary_loss_mlp": 0.01032952, "balance_loss_clip": 1.01468873, "balance_loss_mlp": 1.02740884, "epoch": 0.2135577934766271, "flos": 17346595680000.0, "grad_norm": 1.9473713258331604, "language_loss": 0.83161962, "learning_rate": 3.5668548217076605e-06, "loss": 0.85286444, "num_input_tokens_seen": 76656695, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.640625, "step": 3552, "time_per_iteration": 2.3684539794921875 }, { "auxiliary_loss_clip": 0.0108993, "auxiliary_loss_mlp": 0.01038122, "balance_loss_clip": 1.02004933, "balance_loss_mlp": 1.02729726, "epoch": 0.21361791672929506, "flos": 24056169321600.0, "grad_norm": 1.6071835531330094, "language_loss": 0.76515716, "learning_rate": 3.5666199834836855e-06, "loss": 0.78643763, "num_input_tokens_seen": 76677430, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.625, "step": 3553, "time_per_iteration": 2.418617010116577 }, { "auxiliary_loss_clip": 0.01089905, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.02044606, "balance_loss_mlp": 1.02761078, "epoch": 0.21367803998196302, "flos": 22161258213120.0, "grad_norm": 1.560754988037051, "language_loss": 0.72691786, "learning_rate": 3.5663850893510734e-06, "loss": 0.74819732, "num_input_tokens_seen": 76697615, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.625, "step": 3554, "time_per_iteration": 2.400346517562866 }, { "auxiliary_loss_clip": 0.01090047, "auxiliary_loss_mlp": 0.01034065, "balance_loss_clip": 1.01655304, "balance_loss_mlp": 1.02673614, "epoch": 0.213738163234631, "flos": 20885811091200.0, "grad_norm": 2.035509144294839, "language_loss": 0.67685765, "learning_rate": 3.566150139318206e-06, "loss": 0.69809878, "num_input_tokens_seen": 76715685, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.6328125, "step": 3555, "time_per_iteration": 2.361581802368164 }, { "auxiliary_loss_clip": 0.01091967, "auxiliary_loss_mlp": 0.01036928, "balance_loss_clip": 1.01806927, "balance_loss_mlp": 1.0273869, "epoch": 0.21379828648729896, "flos": 28401843294720.0, "grad_norm": 1.8029032971913221, "language_loss": 0.64442456, "learning_rate": 3.56591513339347e-06, "loss": 0.66571343, "num_input_tokens_seen": 76735405, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.64453125, "step": 3556, "time_per_iteration": 2.4424357414245605 }, { "auxiliary_loss_clip": 0.01093633, "auxiliary_loss_mlp": 0.01040235, "balance_loss_clip": 1.02211511, "balance_loss_mlp": 1.02866197, "epoch": 0.21385840973996692, "flos": 25478600734080.0, "grad_norm": 1.699493702630759, "language_loss": 0.7273261, "learning_rate": 3.56568007158525e-06, "loss": 0.74866474, "num_input_tokens_seen": 76754395, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6484375, "step": 3557, "time_per_iteration": 2.4168779850006104 }, { "auxiliary_loss_clip": 0.01093852, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.01659131, "balance_loss_mlp": 1.02738237, "epoch": 0.2139185329926349, "flos": 28073031310080.0, "grad_norm": 1.6753315684596148, "language_loss": 0.67222565, "learning_rate": 3.565444953901935e-06, "loss": 0.69352198, "num_input_tokens_seen": 76777210, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6640625, "step": 3558, "time_per_iteration": 2.455333709716797 }, { "auxiliary_loss_clip": 0.01094301, "auxiliary_loss_mlp": 0.01039093, "balance_loss_clip": 1.02034104, "balance_loss_mlp": 1.0276885, "epoch": 0.21397865624530288, "flos": 19607710705920.0, "grad_norm": 1.8618981889895183, "language_loss": 0.79920673, "learning_rate": 3.5652097803519173e-06, "loss": 0.82054073, "num_input_tokens_seen": 76795830, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6640625, "step": 3559, "time_per_iteration": 2.3532660007476807 }, { "auxiliary_loss_clip": 0.01088294, "auxiliary_loss_mlp": 0.01035535, "balance_loss_clip": 1.01770151, "balance_loss_mlp": 1.02630925, "epoch": 0.21403877949797084, "flos": 24680311430400.0, "grad_norm": 1.5941598822264722, "language_loss": 0.67690969, "learning_rate": 3.5649745509435887e-06, "loss": 0.69814801, "num_input_tokens_seen": 76814700, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6171875, "step": 3560, "time_per_iteration": 2.426039695739746 }, { "auxiliary_loss_clip": 0.01093116, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.02019823, "balance_loss_mlp": 1.02826059, "epoch": 0.2140989027506388, "flos": 19860237636480.0, "grad_norm": 1.9546038125169252, "language_loss": 0.72802126, "learning_rate": 3.564739265685344e-06, "loss": 0.74933589, "num_input_tokens_seen": 76833400, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6484375, "step": 3561, "time_per_iteration": 2.373037576675415 }, { "auxiliary_loss_clip": 0.01092301, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.01895809, "balance_loss_mlp": 1.02813137, "epoch": 0.21415902600330677, "flos": 19134323314560.0, "grad_norm": 2.091336605978353, "language_loss": 0.77274024, "learning_rate": 3.56450392458558e-06, "loss": 0.79403764, "num_input_tokens_seen": 76850645, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.640625, "step": 3562, "time_per_iteration": 2.371242046356201 }, { "auxiliary_loss_clip": 0.01092097, "auxiliary_loss_mlp": 0.0103462, "balance_loss_clip": 1.01640487, "balance_loss_mlp": 1.02860069, "epoch": 0.21421914925597474, "flos": 22271548798080.0, "grad_norm": 2.146029902485723, "language_loss": 0.84829164, "learning_rate": 3.564268527652695e-06, "loss": 0.86955887, "num_input_tokens_seen": 76870135, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.63671875, "step": 3563, "time_per_iteration": 2.3777878284454346 }, { "auxiliary_loss_clip": 0.01090524, "auxiliary_loss_mlp": 0.01030805, "balance_loss_clip": 1.01405644, "balance_loss_mlp": 1.02772045, "epoch": 0.2142792725086427, "flos": 33873710860800.0, "grad_norm": 1.4489453244027943, "language_loss": 0.76527488, "learning_rate": 3.5640330748950902e-06, "loss": 0.78648818, "num_input_tokens_seen": 76893905, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.62890625, "step": 3564, "time_per_iteration": 2.523754835128784 }, { "auxiliary_loss_clip": 0.01089924, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.01730728, "balance_loss_mlp": 1.02771103, "epoch": 0.2143393957613107, "flos": 19859329941120.0, "grad_norm": 1.750469628884493, "language_loss": 0.88693774, "learning_rate": 3.5637975663211677e-06, "loss": 0.9081853, "num_input_tokens_seen": 76914205, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.625, "step": 3565, "time_per_iteration": 2.399671792984009 }, { "auxiliary_loss_clip": 0.01021939, "auxiliary_loss_mlp": 0.01002098, "balance_loss_clip": 1.0002147, "balance_loss_mlp": 1.00620675, "epoch": 0.21439951901397866, "flos": 68526891936000.0, "grad_norm": 0.8405105559125486, "language_loss": 0.52237988, "learning_rate": 3.563562001939333e-06, "loss": 0.54262018, "num_input_tokens_seen": 76975650, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.15722656, "step": 3566, "time_per_iteration": 2.9677493572235107 }, { "auxiliary_loss_clip": 0.01087065, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.01798034, "balance_loss_mlp": 1.02784598, "epoch": 0.21445964226664663, "flos": 19681970901120.0, "grad_norm": 10.254505098981035, "language_loss": 0.66883928, "learning_rate": 3.563326381757993e-06, "loss": 0.69005579, "num_input_tokens_seen": 76992615, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59375, "step": 3567, "time_per_iteration": 2.350949287414551 }, { "auxiliary_loss_clip": 0.01088321, "auxiliary_loss_mlp": 0.01035676, "balance_loss_clip": 1.01858127, "balance_loss_mlp": 1.027457, "epoch": 0.2145197655193146, "flos": 31105796405760.0, "grad_norm": 1.6680941296164373, "language_loss": 0.74234676, "learning_rate": 3.563090705785555e-06, "loss": 0.76358676, "num_input_tokens_seen": 77017005, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.609375, "step": 3568, "time_per_iteration": 2.4811275005340576 }, { "auxiliary_loss_clip": 0.01092114, "auxiliary_loss_mlp": 0.01039167, "balance_loss_clip": 1.02103531, "balance_loss_mlp": 1.0285691, "epoch": 0.21457988877198256, "flos": 20119746839040.0, "grad_norm": 1.5262703727022713, "language_loss": 0.77591181, "learning_rate": 3.5628549740304307e-06, "loss": 0.79722464, "num_input_tokens_seen": 77034990, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.63671875, "step": 3569, "time_per_iteration": 2.3640496730804443 }, { "auxiliary_loss_clip": 0.01097877, "auxiliary_loss_mlp": 0.01042975, "balance_loss_clip": 1.02281713, "balance_loss_mlp": 1.02996242, "epoch": 0.21464001202465052, "flos": 18587059752960.0, "grad_norm": 2.594278766033549, "language_loss": 0.70476753, "learning_rate": 3.562619186501032e-06, "loss": 0.72617602, "num_input_tokens_seen": 77052610, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.6796875, "step": 3570, "time_per_iteration": 3.727426767349243 }, { "auxiliary_loss_clip": 0.01093941, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.01766574, "balance_loss_mlp": 1.02872419, "epoch": 0.21470013527731852, "flos": 21834087062400.0, "grad_norm": 2.066701985883769, "language_loss": 0.78749084, "learning_rate": 3.562383343205774e-06, "loss": 0.80879158, "num_input_tokens_seen": 77072475, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.65234375, "step": 3571, "time_per_iteration": 2.376452922821045 }, { "auxiliary_loss_clip": 0.01093271, "auxiliary_loss_mlp": 0.01034033, "balance_loss_clip": 1.01385117, "balance_loss_mlp": 1.02877557, "epoch": 0.21476025852998648, "flos": 17602229721600.0, "grad_norm": 2.0248957534966694, "language_loss": 0.82518691, "learning_rate": 3.5621474441530744e-06, "loss": 0.84645993, "num_input_tokens_seen": 77089930, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.64453125, "step": 3572, "time_per_iteration": 3.739330530166626 }, { "auxiliary_loss_clip": 0.01095537, "auxiliary_loss_mlp": 0.01037888, "balance_loss_clip": 1.01826668, "balance_loss_mlp": 1.02786791, "epoch": 0.21482038178265445, "flos": 24826946607360.0, "grad_norm": 4.239586664123104, "language_loss": 0.64667124, "learning_rate": 3.5619114893513508e-06, "loss": 0.66800553, "num_input_tokens_seen": 77108970, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.67578125, "step": 3573, "time_per_iteration": 3.7828338146209717 }, { "auxiliary_loss_clip": 0.01086014, "auxiliary_loss_mlp": 0.01036069, "balance_loss_clip": 1.01881957, "balance_loss_mlp": 1.02638626, "epoch": 0.2148805050353224, "flos": 23257111968000.0, "grad_norm": 1.954719978928461, "language_loss": 0.75047588, "learning_rate": 3.5616754788090235e-06, "loss": 0.77169669, "num_input_tokens_seen": 77126045, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.59375, "step": 3574, "time_per_iteration": 2.391507148742676 }, { "auxiliary_loss_clip": 0.01088668, "auxiliary_loss_mlp": 0.01033446, "balance_loss_clip": 1.01467037, "balance_loss_mlp": 1.0269568, "epoch": 0.21494062828799038, "flos": 21320130804480.0, "grad_norm": 1.8067005748642064, "language_loss": 0.71893322, "learning_rate": 3.561439412534515e-06, "loss": 0.74015439, "num_input_tokens_seen": 77144600, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6171875, "step": 3575, "time_per_iteration": 2.3854336738586426 }, { "auxiliary_loss_clip": 0.01090527, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.01526237, "balance_loss_mlp": 1.02761424, "epoch": 0.21500075154065834, "flos": 18842344680960.0, "grad_norm": 1.7983627799214463, "language_loss": 0.68403685, "learning_rate": 3.561203290536251e-06, "loss": 0.70527697, "num_input_tokens_seen": 77162965, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.62890625, "step": 3576, "time_per_iteration": 3.8433103561401367 }, { "auxiliary_loss_clip": 0.01095384, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.01510596, "balance_loss_mlp": 1.02919865, "epoch": 0.2150608747933263, "flos": 18441018069120.0, "grad_norm": 1.767014805779442, "language_loss": 0.88839924, "learning_rate": 3.560967112822657e-06, "loss": 0.90969938, "num_input_tokens_seen": 77179960, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.6640625, "step": 3577, "time_per_iteration": 2.347475528717041 }, { "auxiliary_loss_clip": 0.01020887, "auxiliary_loss_mlp": 0.01001967, "balance_loss_clip": 0.99982095, "balance_loss_mlp": 1.00577354, "epoch": 0.2151209980459943, "flos": 66595042742400.0, "grad_norm": 0.8061405269424026, "language_loss": 0.56194675, "learning_rate": 3.5607308794021623e-06, "loss": 0.58217531, "num_input_tokens_seen": 77239500, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.15136719, "step": 3578, "time_per_iteration": 2.9712166786193848 }, { "auxiliary_loss_clip": 0.01091219, "auxiliary_loss_mlp": 0.01036472, "balance_loss_clip": 1.01766121, "balance_loss_mlp": 1.02836454, "epoch": 0.21518112129866226, "flos": 21574926973440.0, "grad_norm": 1.6810222968183248, "language_loss": 0.88131016, "learning_rate": 3.5604945902831975e-06, "loss": 0.90258706, "num_input_tokens_seen": 77254680, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.62890625, "step": 3579, "time_per_iteration": 2.3581838607788086 }, { "auxiliary_loss_clip": 0.01093006, "auxiliary_loss_mlp": 0.01041605, "balance_loss_clip": 1.0219475, "balance_loss_mlp": 1.02872515, "epoch": 0.21524124455133023, "flos": 20046603807360.0, "grad_norm": 1.8056512339471995, "language_loss": 0.78003907, "learning_rate": 3.560258245474194e-06, "loss": 0.80138516, "num_input_tokens_seen": 77274060, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.640625, "step": 3580, "time_per_iteration": 2.4127047061920166 }, { "auxiliary_loss_clip": 0.01090745, "auxiliary_loss_mlp": 0.01038444, "balance_loss_clip": 1.02080107, "balance_loss_mlp": 1.02885795, "epoch": 0.2153013678039982, "flos": 23950661592960.0, "grad_norm": 1.9748070906757218, "language_loss": 0.7300638, "learning_rate": 3.5600218449835876e-06, "loss": 0.75135565, "num_input_tokens_seen": 77293255, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6171875, "step": 3581, "time_per_iteration": 2.3917815685272217 }, { "auxiliary_loss_clip": 0.01090551, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.02257752, "balance_loss_mlp": 1.02849686, "epoch": 0.21536149105666616, "flos": 20593797546240.0, "grad_norm": 5.8618237838227305, "language_loss": 0.70694757, "learning_rate": 3.559785388819815e-06, "loss": 0.72827828, "num_input_tokens_seen": 77312390, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.62109375, "step": 3582, "time_per_iteration": 2.394432306289673 }, { "auxiliary_loss_clip": 0.01091108, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.01861525, "balance_loss_mlp": 1.02831674, "epoch": 0.21542161430933413, "flos": 12859209031680.0, "grad_norm": 2.2836442038875817, "language_loss": 0.83861291, "learning_rate": 3.5595488769913134e-06, "loss": 0.8599034, "num_input_tokens_seen": 77330985, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.62890625, "step": 3583, "time_per_iteration": 2.3503217697143555 }, { "auxiliary_loss_clip": 0.01095956, "auxiliary_loss_mlp": 0.01041814, "balance_loss_clip": 1.02240705, "balance_loss_mlp": 1.03027546, "epoch": 0.21548173756200212, "flos": 26102742842880.0, "grad_norm": 2.252417622498085, "language_loss": 0.83012831, "learning_rate": 3.5593123095065245e-06, "loss": 0.85150599, "num_input_tokens_seen": 77350770, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.65625, "step": 3584, "time_per_iteration": 2.413504123687744 }, { "auxiliary_loss_clip": 0.0109106, "auxiliary_loss_mlp": 0.01037681, "balance_loss_clip": 1.02002573, "balance_loss_mlp": 1.02791607, "epoch": 0.21554186081467008, "flos": 22162689578880.0, "grad_norm": 2.2177269774423825, "language_loss": 0.89685279, "learning_rate": 3.55907568637389e-06, "loss": 0.91814023, "num_input_tokens_seen": 77370510, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6328125, "step": 3585, "time_per_iteration": 2.3835370540618896 }, { "auxiliary_loss_clip": 0.01092262, "auxiliary_loss_mlp": 0.01042329, "balance_loss_clip": 1.02404249, "balance_loss_mlp": 1.02910924, "epoch": 0.21560198406733805, "flos": 22965622093440.0, "grad_norm": 1.9904693913663805, "language_loss": 0.74803352, "learning_rate": 3.558839007601855e-06, "loss": 0.76937938, "num_input_tokens_seen": 77390645, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.62890625, "step": 3586, "time_per_iteration": 2.3789827823638916 }, { "auxiliary_loss_clip": 0.01091172, "auxiliary_loss_mlp": 0.01038639, "balance_loss_clip": 1.02122235, "balance_loss_mlp": 1.02772009, "epoch": 0.215662107320006, "flos": 22782956526720.0, "grad_norm": 1.892175981842573, "language_loss": 0.82819235, "learning_rate": 3.558602273198865e-06, "loss": 0.84949052, "num_input_tokens_seen": 77409655, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.6328125, "step": 3587, "time_per_iteration": 2.4005024433135986 }, { "auxiliary_loss_clip": 0.01092139, "auxiliary_loss_mlp": 0.01033209, "balance_loss_clip": 1.01403999, "balance_loss_mlp": 1.02778912, "epoch": 0.21572223057267398, "flos": 30882527061120.0, "grad_norm": 1.9281663194590999, "language_loss": 0.75907916, "learning_rate": 3.558365483173369e-06, "loss": 0.78033262, "num_input_tokens_seen": 77430560, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.64453125, "step": 3588, "time_per_iteration": 2.4512038230895996 }, { "auxiliary_loss_clip": 0.01089851, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.01693416, "balance_loss_mlp": 1.02671766, "epoch": 0.21578235382534194, "flos": 26909166493440.0, "grad_norm": 1.7400579268636616, "language_loss": 0.80721128, "learning_rate": 3.5581286375338183e-06, "loss": 0.8284539, "num_input_tokens_seen": 77455000, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.6328125, "step": 3589, "time_per_iteration": 2.464797019958496 }, { "auxiliary_loss_clip": 0.01092967, "auxiliary_loss_mlp": 0.0103363, "balance_loss_clip": 1.01548052, "balance_loss_mlp": 1.02881718, "epoch": 0.2158424770780099, "flos": 24424572654720.0, "grad_norm": 1.796896618802849, "language_loss": 0.72772634, "learning_rate": 3.557891736288664e-06, "loss": 0.74899232, "num_input_tokens_seen": 77475075, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.640625, "step": 3590, "time_per_iteration": 2.4086365699768066 }, { "auxiliary_loss_clip": 0.01094064, "auxiliary_loss_mlp": 0.01040865, "balance_loss_clip": 1.02068269, "balance_loss_mlp": 1.02712011, "epoch": 0.2159026003306779, "flos": 23948881113600.0, "grad_norm": 1.8203562472573895, "language_loss": 0.84164977, "learning_rate": 3.5576547794463608e-06, "loss": 0.86299908, "num_input_tokens_seen": 77495945, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.671875, "step": 3591, "time_per_iteration": 2.4037511348724365 }, { "auxiliary_loss_clip": 0.01097554, "auxiliary_loss_mlp": 0.01038275, "balance_loss_clip": 1.01684117, "balance_loss_mlp": 1.02815509, "epoch": 0.21596272358334587, "flos": 30039758818560.0, "grad_norm": 1.9655070881946446, "language_loss": 0.69344044, "learning_rate": 3.557417767015366e-06, "loss": 0.71479869, "num_input_tokens_seen": 77517140, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 0.6953125, "step": 3592, "time_per_iteration": 2.4410736560821533 }, { "auxiliary_loss_clip": 0.01094848, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.02173829, "balance_loss_mlp": 1.02889371, "epoch": 0.21602284683601383, "flos": 20375171412480.0, "grad_norm": 2.5800130780775246, "language_loss": 0.83571708, "learning_rate": 3.557180699004137e-06, "loss": 0.85707629, "num_input_tokens_seen": 77536085, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.66015625, "step": 3593, "time_per_iteration": 2.3666434288024902 }, { "auxiliary_loss_clip": 0.01093772, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.02636051, "balance_loss_mlp": 1.02722239, "epoch": 0.2160829700886818, "flos": 20776288556160.0, "grad_norm": 3.305035296664814, "language_loss": 0.75017703, "learning_rate": 3.556943575421134e-06, "loss": 0.77157712, "num_input_tokens_seen": 77553675, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6640625, "step": 3594, "time_per_iteration": 2.364056348800659 }, { "auxiliary_loss_clip": 0.01090224, "auxiliary_loss_mlp": 0.01033744, "balance_loss_clip": 1.01548111, "balance_loss_mlp": 1.02737951, "epoch": 0.21614309334134976, "flos": 22308661440000.0, "grad_norm": 1.470275823256459, "language_loss": 0.80304974, "learning_rate": 3.55670639627482e-06, "loss": 0.82428944, "num_input_tokens_seen": 77573360, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.62890625, "step": 3595, "time_per_iteration": 2.4001059532165527 }, { "auxiliary_loss_clip": 0.01093998, "auxiliary_loss_mlp": 0.01035091, "balance_loss_clip": 1.01549327, "balance_loss_mlp": 1.02793932, "epoch": 0.21620321659401773, "flos": 19608513667200.0, "grad_norm": 1.8896780145521013, "language_loss": 0.78621411, "learning_rate": 3.556469161573659e-06, "loss": 0.80750501, "num_input_tokens_seen": 77591865, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.66015625, "step": 3596, "time_per_iteration": 2.349175214767456 }, { "auxiliary_loss_clip": 0.01088236, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.01828587, "balance_loss_mlp": 1.02725124, "epoch": 0.2162633398466857, "flos": 18843531667200.0, "grad_norm": 2.4123382452893813, "language_loss": 0.83061266, "learning_rate": 3.556231871326118e-06, "loss": 0.85185635, "num_input_tokens_seen": 77611600, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.609375, "step": 3597, "time_per_iteration": 2.3800954818725586 }, { "auxiliary_loss_clip": 0.01091466, "auxiliary_loss_mlp": 0.01035036, "balance_loss_clip": 1.01570058, "balance_loss_mlp": 1.02618265, "epoch": 0.21632346309935369, "flos": 18767875017600.0, "grad_norm": 1.568841190746847, "language_loss": 0.8058297, "learning_rate": 3.5559945255406635e-06, "loss": 0.82709467, "num_input_tokens_seen": 77630665, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.65625, "step": 3598, "time_per_iteration": 2.363445520401001 }, { "auxiliary_loss_clip": 0.01093682, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.02007306, "balance_loss_mlp": 1.02620852, "epoch": 0.21638358635202165, "flos": 26322939987840.0, "grad_norm": 1.781455726806936, "language_loss": 0.82309818, "learning_rate": 3.555757124225767e-06, "loss": 0.8444438, "num_input_tokens_seen": 77650835, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.67578125, "step": 3599, "time_per_iteration": 2.4402010440826416 }, { "auxiliary_loss_clip": 0.01088121, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.01438618, "balance_loss_mlp": 1.02644491, "epoch": 0.21644370960468962, "flos": 20739804318720.0, "grad_norm": 1.7064381165347018, "language_loss": 0.76435298, "learning_rate": 3.5555196673899015e-06, "loss": 0.7855674, "num_input_tokens_seen": 77669000, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6171875, "step": 3600, "time_per_iteration": 2.372791290283203 }, { "auxiliary_loss_clip": 0.01091173, "auxiliary_loss_mlp": 0.01037773, "balance_loss_clip": 1.01993895, "balance_loss_mlp": 1.02560043, "epoch": 0.21650383285735758, "flos": 23951080529280.0, "grad_norm": 1.6802841912055213, "language_loss": 0.79665279, "learning_rate": 3.5552821550415396e-06, "loss": 0.81794226, "num_input_tokens_seen": 77688745, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.65625, "step": 3601, "time_per_iteration": 2.4099018573760986 }, { "auxiliary_loss_clip": 0.01091825, "auxiliary_loss_mlp": 0.01036843, "balance_loss_clip": 1.01902688, "balance_loss_mlp": 1.02843213, "epoch": 0.21656395611002555, "flos": 23694957728640.0, "grad_norm": 1.801246734435365, "language_loss": 0.83378541, "learning_rate": 3.5550445871891585e-06, "loss": 0.85507202, "num_input_tokens_seen": 77708445, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6328125, "step": 3602, "time_per_iteration": 2.38606858253479 }, { "auxiliary_loss_clip": 0.0109169, "auxiliary_loss_mlp": 0.0103813, "balance_loss_clip": 1.01948524, "balance_loss_mlp": 1.02627182, "epoch": 0.2166240793626935, "flos": 20665055364480.0, "grad_norm": 2.1522657692338827, "language_loss": 0.7434755, "learning_rate": 3.554806963841236e-06, "loss": 0.76477373, "num_input_tokens_seen": 77728465, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.65625, "step": 3603, "time_per_iteration": 2.4077882766723633 }, { "auxiliary_loss_clip": 0.01088508, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.01874018, "balance_loss_mlp": 1.02596533, "epoch": 0.2166842026153615, "flos": 21579325804800.0, "grad_norm": 1.6006828308954126, "language_loss": 0.74189007, "learning_rate": 3.554569285006253e-06, "loss": 0.76313949, "num_input_tokens_seen": 77746735, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.625, "step": 3604, "time_per_iteration": 2.368547201156616 }, { "auxiliary_loss_clip": 0.01087886, "auxiliary_loss_mlp": 0.01032106, "balance_loss_clip": 1.0141654, "balance_loss_mlp": 1.02642488, "epoch": 0.21674432586802947, "flos": 25628761958400.0, "grad_norm": 1.6650597996806924, "language_loss": 0.79789186, "learning_rate": 3.5543315506926903e-06, "loss": 0.8190918, "num_input_tokens_seen": 77768105, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6171875, "step": 3605, "time_per_iteration": 2.428480625152588 }, { "auxiliary_loss_clip": 0.01022334, "auxiliary_loss_mlp": 0.01005826, "balance_loss_clip": 1.00375164, "balance_loss_mlp": 1.00654793, "epoch": 0.21680444912069743, "flos": 56414893155840.0, "grad_norm": 0.6891858059380648, "language_loss": 0.5835098, "learning_rate": 3.5540937609090334e-06, "loss": 0.60379136, "num_input_tokens_seen": 77833750, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.15820312, "step": 3606, "time_per_iteration": 3.1059229373931885 }, { "auxiliary_loss_clip": 0.01092309, "auxiliary_loss_mlp": 0.01040399, "balance_loss_clip": 1.02065825, "balance_loss_mlp": 1.02667499, "epoch": 0.2168645723733654, "flos": 23877797852160.0, "grad_norm": 2.1719810960899184, "language_loss": 0.73038226, "learning_rate": 3.5538559156637675e-06, "loss": 0.75170934, "num_input_tokens_seen": 77853780, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.65625, "step": 3607, "time_per_iteration": 2.390773057937622 }, { "auxiliary_loss_clip": 0.01094745, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.01831567, "balance_loss_mlp": 1.02730608, "epoch": 0.21692469562603336, "flos": 16945234156800.0, "grad_norm": 1.8680270875352738, "language_loss": 0.76685357, "learning_rate": 3.5536180149653805e-06, "loss": 0.78817213, "num_input_tokens_seen": 77872575, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.67578125, "step": 3608, "time_per_iteration": 2.34770131111145 }, { "auxiliary_loss_clip": 0.01093642, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.01961255, "balance_loss_mlp": 1.02752662, "epoch": 0.21698481887870133, "flos": 25117877900160.0, "grad_norm": 1.8803327586037324, "language_loss": 0.7461046, "learning_rate": 3.5533800588223636e-06, "loss": 0.76742268, "num_input_tokens_seen": 77892700, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.66015625, "step": 3609, "time_per_iteration": 3.786299467086792 }, { "auxiliary_loss_clip": 0.01094841, "auxiliary_loss_mlp": 0.01040215, "balance_loss_clip": 1.0212965, "balance_loss_mlp": 1.02876544, "epoch": 0.2170449421313693, "flos": 17893719596160.0, "grad_norm": 1.7049591031448108, "language_loss": 0.88620341, "learning_rate": 3.553142047243208e-06, "loss": 0.90755397, "num_input_tokens_seen": 77911060, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6640625, "step": 3610, "time_per_iteration": 2.3434276580810547 }, { "auxiliary_loss_clip": 0.01091445, "auxiliary_loss_mlp": 0.01032203, "balance_loss_clip": 1.013201, "balance_loss_mlp": 1.02796221, "epoch": 0.2171050653840373, "flos": 22637333779200.0, "grad_norm": 1.7517505191453933, "language_loss": 0.7782172, "learning_rate": 3.5529039802364077e-06, "loss": 0.79945368, "num_input_tokens_seen": 77929930, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6328125, "step": 3611, "time_per_iteration": 3.74212908744812 }, { "auxiliary_loss_clip": 0.01087637, "auxiliary_loss_mlp": 0.01036932, "balance_loss_clip": 1.01925349, "balance_loss_mlp": 1.0257417, "epoch": 0.21716518863670525, "flos": 19498991132160.0, "grad_norm": 3.32508520275354, "language_loss": 0.63251287, "learning_rate": 3.552665857810459e-06, "loss": 0.65375859, "num_input_tokens_seen": 77949060, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.6171875, "step": 3612, "time_per_iteration": 2.364633798599243 }, { "auxiliary_loss_clip": 0.01090932, "auxiliary_loss_mlp": 0.01035258, "balance_loss_clip": 1.01723361, "balance_loss_mlp": 1.02709413, "epoch": 0.21722531188937322, "flos": 19791004677120.0, "grad_norm": 2.148770153023427, "language_loss": 0.75538373, "learning_rate": 3.5524276799738594e-06, "loss": 0.77664566, "num_input_tokens_seen": 77967920, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.63671875, "step": 3613, "time_per_iteration": 3.8147830963134766 }, { "auxiliary_loss_clip": 0.010892, "auxiliary_loss_mlp": 0.01043653, "balance_loss_clip": 1.02454424, "balance_loss_mlp": 1.02751851, "epoch": 0.21728543514204118, "flos": 13333539029760.0, "grad_norm": 2.2336364813128204, "language_loss": 0.70556039, "learning_rate": 3.5521894467351095e-06, "loss": 0.7268889, "num_input_tokens_seen": 77985330, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6171875, "step": 3614, "time_per_iteration": 2.370621919631958 }, { "auxiliary_loss_clip": 0.01091621, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.01693392, "balance_loss_mlp": 1.02753961, "epoch": 0.21734555839470915, "flos": 15230963756160.0, "grad_norm": 2.8856236911761113, "language_loss": 0.73475075, "learning_rate": 3.551951158102711e-06, "loss": 0.75601697, "num_input_tokens_seen": 78003105, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.640625, "step": 3615, "time_per_iteration": 2.3954524993896484 }, { "auxiliary_loss_clip": 0.01095758, "auxiliary_loss_mlp": 0.01038228, "balance_loss_clip": 1.01796234, "balance_loss_mlp": 1.02757883, "epoch": 0.2174056816473771, "flos": 19972972016640.0, "grad_norm": 2.0453382039960673, "language_loss": 0.89979649, "learning_rate": 3.5517128140851682e-06, "loss": 0.92113632, "num_input_tokens_seen": 78019655, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.68359375, "step": 3616, "time_per_iteration": 3.74100399017334 }, { "auxiliary_loss_clip": 0.01093413, "auxiliary_loss_mlp": 0.01036404, "balance_loss_clip": 1.01679385, "balance_loss_mlp": 1.02681398, "epoch": 0.21746580490004508, "flos": 16686458092800.0, "grad_norm": 2.8210824049104897, "language_loss": 0.81051397, "learning_rate": 3.551474414690986e-06, "loss": 0.83181214, "num_input_tokens_seen": 78036025, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.66796875, "step": 3617, "time_per_iteration": 2.3564059734344482 }, { "auxiliary_loss_clip": 0.01094452, "auxiliary_loss_mlp": 0.01040592, "balance_loss_clip": 1.02130365, "balance_loss_mlp": 1.0282445, "epoch": 0.21752592815271307, "flos": 25771207772160.0, "grad_norm": 1.8723426473039015, "language_loss": 0.75308621, "learning_rate": 3.551235959928673e-06, "loss": 0.77443665, "num_input_tokens_seen": 78055645, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6640625, "step": 3618, "time_per_iteration": 2.418596029281616 }, { "auxiliary_loss_clip": 0.01090999, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.01904535, "balance_loss_mlp": 1.0256393, "epoch": 0.21758605140538104, "flos": 11253902584320.0, "grad_norm": 1.7939286684639173, "language_loss": 0.69565201, "learning_rate": 3.550997449806739e-06, "loss": 0.71694398, "num_input_tokens_seen": 78071660, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.65234375, "step": 3619, "time_per_iteration": 2.3391482830047607 }, { "auxiliary_loss_clip": 0.01096509, "auxiliary_loss_mlp": 0.01037495, "balance_loss_clip": 1.01752806, "balance_loss_mlp": 1.02994764, "epoch": 0.217646174658049, "flos": 19241681345280.0, "grad_norm": 2.7936134290692864, "language_loss": 0.78751981, "learning_rate": 3.5507588843336953e-06, "loss": 0.80885983, "num_input_tokens_seen": 78091265, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6640625, "step": 3620, "time_per_iteration": 2.3868601322174072 }, { "auxiliary_loss_clip": 0.01087877, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.01588476, "balance_loss_mlp": 1.02686977, "epoch": 0.21770629791071697, "flos": 21943993622400.0, "grad_norm": 1.4737300606646127, "language_loss": 0.80021012, "learning_rate": 3.5505202635180556e-06, "loss": 0.82143313, "num_input_tokens_seen": 78110095, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.609375, "step": 3621, "time_per_iteration": 2.3791918754577637 }, { "auxiliary_loss_clip": 0.01088616, "auxiliary_loss_mlp": 0.01033982, "balance_loss_clip": 1.01675606, "balance_loss_mlp": 1.02592874, "epoch": 0.21776642116338493, "flos": 24935596358400.0, "grad_norm": 1.5861360912655296, "language_loss": 0.87613547, "learning_rate": 3.550281587368337e-06, "loss": 0.89736146, "num_input_tokens_seen": 78129475, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.625, "step": 3622, "time_per_iteration": 2.414642333984375 }, { "auxiliary_loss_clip": 0.01093598, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.01670909, "balance_loss_mlp": 1.02774429, "epoch": 0.2178265444160529, "flos": 17820367096320.0, "grad_norm": 2.1261771538098895, "language_loss": 0.77160025, "learning_rate": 3.550042855893056e-06, "loss": 0.79290915, "num_input_tokens_seen": 78146880, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.66015625, "step": 3623, "time_per_iteration": 2.3404550552368164 }, { "auxiliary_loss_clip": 0.01095072, "auxiliary_loss_mlp": 0.01045093, "balance_loss_clip": 1.0246253, "balance_loss_mlp": 1.02859235, "epoch": 0.2178866676687209, "flos": 17711926813440.0, "grad_norm": 1.852630681807572, "language_loss": 0.8442961, "learning_rate": 3.549804069100733e-06, "loss": 0.86569786, "num_input_tokens_seen": 78165065, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.6640625, "step": 3624, "time_per_iteration": 2.369171142578125 }, { "auxiliary_loss_clip": 0.01097251, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.02087736, "balance_loss_mlp": 1.03009987, "epoch": 0.21794679092138886, "flos": 16944919954560.0, "grad_norm": 2.420979482155318, "language_loss": 0.77038062, "learning_rate": 3.5495652269998887e-06, "loss": 0.79175287, "num_input_tokens_seen": 78180005, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.671875, "step": 3625, "time_per_iteration": 2.350097417831421 }, { "auxiliary_loss_clip": 0.01019424, "auxiliary_loss_mlp": 0.01003061, "balance_loss_clip": 1.00118947, "balance_loss_mlp": 1.00478375, "epoch": 0.21800691417405682, "flos": 63715371425280.0, "grad_norm": 0.8079825791042722, "language_loss": 0.60650551, "learning_rate": 3.549326329599048e-06, "loss": 0.62673038, "num_input_tokens_seen": 78245350, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.14648438, "step": 3626, "time_per_iteration": 3.112251043319702 }, { "auxiliary_loss_clip": 0.01094413, "auxiliary_loss_mlp": 0.01038838, "balance_loss_clip": 1.01894259, "balance_loss_mlp": 1.02658427, "epoch": 0.21806703742672479, "flos": 21615321283200.0, "grad_norm": 1.8550901630318493, "language_loss": 0.90389788, "learning_rate": 3.549087376906736e-06, "loss": 0.92523026, "num_input_tokens_seen": 78264165, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6796875, "step": 3627, "time_per_iteration": 2.389331102371216 }, { "auxiliary_loss_clip": 0.01091615, "auxiliary_loss_mlp": 0.01036037, "balance_loss_clip": 1.01701105, "balance_loss_mlp": 1.02693403, "epoch": 0.21812716067939275, "flos": 19353857143680.0, "grad_norm": 1.645331613507943, "language_loss": 0.73366785, "learning_rate": 3.5488483689314795e-06, "loss": 0.75494438, "num_input_tokens_seen": 78283745, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6484375, "step": 3628, "time_per_iteration": 2.3655855655670166 }, { "auxiliary_loss_clip": 0.01088537, "auxiliary_loss_mlp": 0.01039362, "balance_loss_clip": 1.02062225, "balance_loss_mlp": 1.0253588, "epoch": 0.21818728393206072, "flos": 23546995920000.0, "grad_norm": 2.1342552613748897, "language_loss": 0.77395225, "learning_rate": 3.5486093056818094e-06, "loss": 0.79523122, "num_input_tokens_seen": 78302900, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6328125, "step": 3629, "time_per_iteration": 2.394785165786743 }, { "auxiliary_loss_clip": 0.01092003, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.01708627, "balance_loss_mlp": 1.0276897, "epoch": 0.21824740718472868, "flos": 30224379421440.0, "grad_norm": 1.687506484159848, "language_loss": 0.7134552, "learning_rate": 3.5483701871662566e-06, "loss": 0.73471785, "num_input_tokens_seen": 78326470, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.640625, "step": 3630, "time_per_iteration": 2.464221477508545 }, { "auxiliary_loss_clip": 0.01085825, "auxiliary_loss_mlp": 0.01034814, "balance_loss_clip": 1.01738513, "balance_loss_mlp": 1.02591598, "epoch": 0.21830753043739667, "flos": 26133641262720.0, "grad_norm": 1.6603689984438528, "language_loss": 0.76497871, "learning_rate": 3.5481310133933546e-06, "loss": 0.78618515, "num_input_tokens_seen": 78345810, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59765625, "step": 3631, "time_per_iteration": 2.4137797355651855 }, { "auxiliary_loss_clip": 0.01090745, "auxiliary_loss_mlp": 0.01032561, "balance_loss_clip": 1.01465559, "balance_loss_mlp": 1.02812052, "epoch": 0.21836765369006464, "flos": 21719781671040.0, "grad_norm": 2.53121131401874, "language_loss": 0.75330448, "learning_rate": 3.547891784371639e-06, "loss": 0.77453762, "num_input_tokens_seen": 78364085, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 3632, "time_per_iteration": 2.3723511695861816 }, { "auxiliary_loss_clip": 0.01088971, "auxiliary_loss_mlp": 0.01031218, "balance_loss_clip": 1.0143615, "balance_loss_mlp": 1.02587771, "epoch": 0.2184277769427326, "flos": 19936592513280.0, "grad_norm": 3.415645181158411, "language_loss": 0.84082043, "learning_rate": 3.547652500109647e-06, "loss": 0.8620224, "num_input_tokens_seen": 78381385, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.6328125, "step": 3633, "time_per_iteration": 2.364938259124756 }, { "auxiliary_loss_clip": 0.01089758, "auxiliary_loss_mlp": 0.01043776, "balance_loss_clip": 1.0248574, "balance_loss_mlp": 1.02721274, "epoch": 0.21848790019540057, "flos": 20339175934080.0, "grad_norm": 1.5651391173505917, "language_loss": 0.81594479, "learning_rate": 3.547413160615919e-06, "loss": 0.83728015, "num_input_tokens_seen": 78400500, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.625, "step": 3634, "time_per_iteration": 2.4022185802459717 }, { "auxiliary_loss_clip": 0.01093251, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.01472163, "balance_loss_mlp": 1.02807343, "epoch": 0.21854802344806853, "flos": 15449904092160.0, "grad_norm": 1.8847199864380413, "language_loss": 0.74939668, "learning_rate": 3.5471737658989956e-06, "loss": 0.77065599, "num_input_tokens_seen": 78418340, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.65234375, "step": 3635, "time_per_iteration": 2.3405630588531494 }, { "auxiliary_loss_clip": 0.01090406, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.02033162, "balance_loss_mlp": 1.02717638, "epoch": 0.2186081467007365, "flos": 16319939973120.0, "grad_norm": 1.8491380683041085, "language_loss": 0.87294209, "learning_rate": 3.54693431596742e-06, "loss": 0.89422971, "num_input_tokens_seen": 78434375, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6328125, "step": 3636, "time_per_iteration": 2.358074188232422 }, { "auxiliary_loss_clip": 0.01090906, "auxiliary_loss_mlp": 0.01039013, "balance_loss_clip": 1.02036333, "balance_loss_mlp": 1.02711272, "epoch": 0.2186682699534045, "flos": 21688185024000.0, "grad_norm": 2.011595711849557, "language_loss": 0.75966811, "learning_rate": 3.5466948108297377e-06, "loss": 0.78096724, "num_input_tokens_seen": 78451735, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.63671875, "step": 3637, "time_per_iteration": 2.3807408809661865 }, { "auxiliary_loss_clip": 0.01093942, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.01469493, "balance_loss_mlp": 1.02733397, "epoch": 0.21872839320607246, "flos": 17738566047360.0, "grad_norm": 2.6371077640017346, "language_loss": 0.89282644, "learning_rate": 3.5464552504944965e-06, "loss": 0.91411805, "num_input_tokens_seen": 78462730, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.6640625, "step": 3638, "time_per_iteration": 2.3330798149108887 }, { "auxiliary_loss_clip": 0.01091267, "auxiliary_loss_mlp": 0.01042488, "balance_loss_clip": 1.02266312, "balance_loss_mlp": 1.02771962, "epoch": 0.21878851645874042, "flos": 18651544767360.0, "grad_norm": 2.216887211390042, "language_loss": 0.89628458, "learning_rate": 3.546215634970245e-06, "loss": 0.91762209, "num_input_tokens_seen": 78476300, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.6328125, "step": 3639, "time_per_iteration": 2.320844888687134 }, { "auxiliary_loss_clip": 0.01089655, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.01769292, "balance_loss_mlp": 1.02573586, "epoch": 0.2188486397114084, "flos": 25556107685760.0, "grad_norm": 1.9554778272301963, "language_loss": 0.79085922, "learning_rate": 3.545975964265535e-06, "loss": 0.81210792, "num_input_tokens_seen": 78496135, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.640625, "step": 3640, "time_per_iteration": 2.4163002967834473 }, { "auxiliary_loss_clip": 0.01094824, "auxiliary_loss_mlp": 0.01044022, "balance_loss_clip": 1.02319622, "balance_loss_mlp": 1.03164196, "epoch": 0.21890876296407635, "flos": 17891171066880.0, "grad_norm": 2.3250227569417903, "language_loss": 0.72241938, "learning_rate": 3.5457362383889196e-06, "loss": 0.74380779, "num_input_tokens_seen": 78513855, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.6328125, "step": 3641, "time_per_iteration": 2.3454127311706543 }, { "auxiliary_loss_clip": 0.01092003, "auxiliary_loss_mlp": 0.01038669, "balance_loss_clip": 1.02020335, "balance_loss_mlp": 1.02923465, "epoch": 0.21896888621674432, "flos": 17748131760000.0, "grad_norm": 1.9020619069361877, "language_loss": 0.81124008, "learning_rate": 3.5454964573489542e-06, "loss": 0.83254683, "num_input_tokens_seen": 78531740, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.62890625, "step": 3642, "time_per_iteration": 2.3891549110412598 }, { "auxiliary_loss_clip": 0.01094523, "auxiliary_loss_mlp": 0.01041683, "balance_loss_clip": 1.02157211, "balance_loss_mlp": 1.02773678, "epoch": 0.21902900946941228, "flos": 23075039894400.0, "grad_norm": 1.67525366925596, "language_loss": 0.71565598, "learning_rate": 3.545256621154196e-06, "loss": 0.73701805, "num_input_tokens_seen": 78549600, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.66796875, "step": 3643, "time_per_iteration": 2.394054651260376 }, { "auxiliary_loss_clip": 0.01095848, "auxiliary_loss_mlp": 0.01042595, "balance_loss_clip": 1.02365232, "balance_loss_mlp": 1.0290705, "epoch": 0.21908913272208028, "flos": 48176718923520.0, "grad_norm": 2.473822385752144, "language_loss": 0.68169093, "learning_rate": 3.545016729813203e-06, "loss": 0.70307541, "num_input_tokens_seen": 78573350, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.66796875, "step": 3644, "time_per_iteration": 2.6351137161254883 }, { "auxiliary_loss_clip": 0.01094533, "auxiliary_loss_mlp": 0.01035764, "balance_loss_clip": 1.01649952, "balance_loss_mlp": 1.02735949, "epoch": 0.21914925597474824, "flos": 22235658053760.0, "grad_norm": 2.4144990113309537, "language_loss": 0.77770472, "learning_rate": 3.544776783334538e-06, "loss": 0.79900765, "num_input_tokens_seen": 78591005, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.671875, "step": 3645, "time_per_iteration": 2.394505262374878 }, { "auxiliary_loss_clip": 0.01092502, "auxiliary_loss_mlp": 0.01037682, "balance_loss_clip": 1.01994359, "balance_loss_mlp": 1.02877426, "epoch": 0.2192093792274162, "flos": 22124564507520.0, "grad_norm": 1.5917323717771417, "language_loss": 0.82426739, "learning_rate": 3.5445367817267623e-06, "loss": 0.84556925, "num_input_tokens_seen": 78610645, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.63671875, "step": 3646, "time_per_iteration": 2.38145112991333 }, { "auxiliary_loss_clip": 0.01089747, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 1.01435649, "balance_loss_mlp": 1.02789617, "epoch": 0.21926950248008417, "flos": 15668530225920.0, "grad_norm": 1.707471653178866, "language_loss": 0.82878518, "learning_rate": 3.5442967249984427e-06, "loss": 0.84999806, "num_input_tokens_seen": 78628340, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6171875, "step": 3647, "time_per_iteration": 2.3597939014434814 }, { "auxiliary_loss_clip": 0.0108968, "auxiliary_loss_mlp": 0.01042894, "balance_loss_clip": 1.02485752, "balance_loss_mlp": 1.02583981, "epoch": 0.21932962573275214, "flos": 30261212772480.0, "grad_norm": 1.6655901629453167, "language_loss": 0.72428632, "learning_rate": 3.544056613158145e-06, "loss": 0.74561208, "num_input_tokens_seen": 78649355, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.63671875, "step": 3648, "time_per_iteration": 3.8688979148864746 }, { "auxiliary_loss_clip": 0.01092046, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.01795888, "balance_loss_mlp": 1.02543092, "epoch": 0.2193897489854201, "flos": 10779363118080.0, "grad_norm": 2.509063598410687, "language_loss": 0.74706012, "learning_rate": 3.5438164462144383e-06, "loss": 0.768354, "num_input_tokens_seen": 78664915, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.66796875, "step": 3649, "time_per_iteration": 2.347658157348633 }, { "auxiliary_loss_clip": 0.01085887, "auxiliary_loss_mlp": 0.01033565, "balance_loss_clip": 1.01688766, "balance_loss_mlp": 1.02621293, "epoch": 0.21944987223808807, "flos": 19132368278400.0, "grad_norm": 3.118793264486232, "language_loss": 0.86427206, "learning_rate": 3.5435762241758944e-06, "loss": 0.88546658, "num_input_tokens_seen": 78681475, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.59765625, "step": 3650, "time_per_iteration": 2.349252700805664 }, { "auxiliary_loss_clip": 0.01089468, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.01769543, "balance_loss_mlp": 1.02543175, "epoch": 0.21950999549075606, "flos": 22709988051840.0, "grad_norm": 2.10465932582765, "language_loss": 0.83604038, "learning_rate": 3.5433359470510855e-06, "loss": 0.85729772, "num_input_tokens_seen": 78702300, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.640625, "step": 3651, "time_per_iteration": 3.7655768394470215 }, { "auxiliary_loss_clip": 0.01088492, "auxiliary_loss_mlp": 0.01040807, "balance_loss_clip": 1.02285457, "balance_loss_mlp": 1.0253855, "epoch": 0.21957011874342403, "flos": 10560562427520.0, "grad_norm": 1.7186149489782925, "language_loss": 0.74512058, "learning_rate": 3.5430956148485864e-06, "loss": 0.76641357, "num_input_tokens_seen": 78720230, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.62890625, "step": 3652, "time_per_iteration": 3.7480454444885254 }, { "auxiliary_loss_clip": 0.01020375, "auxiliary_loss_mlp": 0.0100853, "balance_loss_clip": 1.00630081, "balance_loss_mlp": 1.00518227, "epoch": 0.219630241996092, "flos": 65745047848320.0, "grad_norm": 0.7414611093971468, "language_loss": 0.51583755, "learning_rate": 3.542855227576974e-06, "loss": 0.53612655, "num_input_tokens_seen": 78780200, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.15234375, "step": 3653, "time_per_iteration": 3.0350093841552734 }, { "auxiliary_loss_clip": 0.01093367, "auxiliary_loss_mlp": 0.01038173, "balance_loss_clip": 1.01992249, "balance_loss_mlp": 1.02969611, "epoch": 0.21969036524875996, "flos": 23695376664960.0, "grad_norm": 1.9980404981099662, "language_loss": 0.75457841, "learning_rate": 3.5426147852448276e-06, "loss": 0.77589381, "num_input_tokens_seen": 78800575, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.63671875, "step": 3654, "time_per_iteration": 2.3962314128875732 }, { "auxiliary_loss_clip": 0.01095718, "auxiliary_loss_mlp": 0.01043015, "balance_loss_clip": 1.02336919, "balance_loss_mlp": 1.02950132, "epoch": 0.21975048850142792, "flos": 19640040491520.0, "grad_norm": 1.88640288741463, "language_loss": 0.724374, "learning_rate": 3.542374287860727e-06, "loss": 0.74576128, "num_input_tokens_seen": 78819585, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.66015625, "step": 3655, "time_per_iteration": 3.762683391571045 }, { "auxiliary_loss_clip": 0.01093002, "auxiliary_loss_mlp": 0.0104213, "balance_loss_clip": 1.02427268, "balance_loss_mlp": 1.02878046, "epoch": 0.21981061175409589, "flos": 22447651029120.0, "grad_norm": 1.5176434959899086, "language_loss": 0.80999374, "learning_rate": 3.542133735433256e-06, "loss": 0.83134508, "num_input_tokens_seen": 78837330, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.640625, "step": 3656, "time_per_iteration": 2.3991000652313232 }, { "auxiliary_loss_clip": 0.01093215, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.0165143, "balance_loss_mlp": 1.02960467, "epoch": 0.21987073500676388, "flos": 18150051864960.0, "grad_norm": 2.1322825570433572, "language_loss": 0.84672594, "learning_rate": 3.541893127970999e-06, "loss": 0.8680166, "num_input_tokens_seen": 78854955, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.63671875, "step": 3657, "time_per_iteration": 2.3575732707977295 }, { "auxiliary_loss_clip": 0.01091715, "auxiliary_loss_mlp": 0.01032139, "balance_loss_clip": 1.01344705, "balance_loss_mlp": 1.02620268, "epoch": 0.21993085825943184, "flos": 25625096265600.0, "grad_norm": 1.613644961719573, "language_loss": 0.8030948, "learning_rate": 3.541652465482542e-06, "loss": 0.82433337, "num_input_tokens_seen": 78874965, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65625, "step": 3658, "time_per_iteration": 2.4104864597320557 }, { "auxiliary_loss_clip": 0.0101832, "auxiliary_loss_mlp": 0.01001977, "balance_loss_clip": 0.99959266, "balance_loss_mlp": 1.00357008, "epoch": 0.2199909815120998, "flos": 70919349096960.0, "grad_norm": 0.7926029416663449, "language_loss": 0.58215219, "learning_rate": 3.5414117479764744e-06, "loss": 0.60235518, "num_input_tokens_seen": 78937740, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.14746094, "step": 3659, "time_per_iteration": 3.1130309104919434 }, { "auxiliary_loss_clip": 0.01092748, "auxiliary_loss_mlp": 0.01031998, "balance_loss_clip": 1.0131036, "balance_loss_mlp": 1.02801776, "epoch": 0.22005110476476777, "flos": 21542457542400.0, "grad_norm": 4.135589391861042, "language_loss": 0.74065894, "learning_rate": 3.5411709754613864e-06, "loss": 0.76190639, "num_input_tokens_seen": 78955055, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6484375, "step": 3660, "time_per_iteration": 2.3905675411224365 }, { "auxiliary_loss_clip": 0.01092462, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.01503634, "balance_loss_mlp": 1.02795982, "epoch": 0.22011122801743574, "flos": 22053411423360.0, "grad_norm": 1.6393205509940065, "language_loss": 0.81110561, "learning_rate": 3.5409301479458707e-06, "loss": 0.83237135, "num_input_tokens_seen": 78974895, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.64453125, "step": 3661, "time_per_iteration": 2.3919174671173096 }, { "auxiliary_loss_clip": 0.01093048, "auxiliary_loss_mlp": 0.01038529, "balance_loss_clip": 1.02069509, "balance_loss_mlp": 1.02945328, "epoch": 0.2201713512701037, "flos": 26686385907840.0, "grad_norm": 1.6930319666421592, "language_loss": 0.73479098, "learning_rate": 3.5406892654385223e-06, "loss": 0.75610673, "num_input_tokens_seen": 78994990, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.63671875, "step": 3662, "time_per_iteration": 2.4425196647644043 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01037748, "balance_loss_clip": 1.02082038, "balance_loss_mlp": 1.02982497, "epoch": 0.22023147452277167, "flos": 22161153479040.0, "grad_norm": 1.4448901819100592, "language_loss": 0.78305918, "learning_rate": 3.540448327947936e-06, "loss": 0.80435061, "num_input_tokens_seen": 79014405, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.61328125, "step": 3663, "time_per_iteration": 2.3859312534332275 }, { "auxiliary_loss_clip": 0.0109561, "auxiliary_loss_mlp": 0.01036633, "balance_loss_clip": 1.01720226, "balance_loss_mlp": 1.02975941, "epoch": 0.22029159777543966, "flos": 22522330160640.0, "grad_norm": 2.53099107067621, "language_loss": 0.80450189, "learning_rate": 3.5402073354827123e-06, "loss": 0.82582432, "num_input_tokens_seen": 79032375, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.66015625, "step": 3664, "time_per_iteration": 2.3736634254455566 }, { "auxiliary_loss_clip": 0.01095337, "auxiliary_loss_mlp": 0.01041624, "balance_loss_clip": 1.02133501, "balance_loss_mlp": 1.02853584, "epoch": 0.22035172102810763, "flos": 13041630218880.0, "grad_norm": 2.928973983608562, "language_loss": 0.76736879, "learning_rate": 3.5399662880514497e-06, "loss": 0.78873837, "num_input_tokens_seen": 79049635, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.66796875, "step": 3665, "time_per_iteration": 2.37835693359375 }, { "auxiliary_loss_clip": 0.01090654, "auxiliary_loss_mlp": 0.01042395, "balance_loss_clip": 1.02353609, "balance_loss_mlp": 1.02657104, "epoch": 0.2204118442807756, "flos": 12165031002240.0, "grad_norm": 2.601770157104923, "language_loss": 0.98128355, "learning_rate": 3.5397251856627524e-06, "loss": 1.00261414, "num_input_tokens_seen": 79062890, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.640625, "step": 3666, "time_per_iteration": 2.326894521713257 }, { "auxiliary_loss_clip": 0.01092503, "auxiliary_loss_mlp": 0.01038429, "balance_loss_clip": 1.02005911, "balance_loss_mlp": 1.02742922, "epoch": 0.22047196753344356, "flos": 40107383493120.0, "grad_norm": 1.8165924049231157, "language_loss": 0.80317688, "learning_rate": 3.5394840283252236e-06, "loss": 0.8244862, "num_input_tokens_seen": 79085495, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6484375, "step": 3667, "time_per_iteration": 2.5395076274871826 }, { "auxiliary_loss_clip": 0.01094245, "auxiliary_loss_mlp": 0.01040842, "balance_loss_clip": 1.02099347, "balance_loss_mlp": 1.0274899, "epoch": 0.22053209078611152, "flos": 20700178058880.0, "grad_norm": 1.7430325284434647, "language_loss": 0.77049088, "learning_rate": 3.53924281604747e-06, "loss": 0.79184175, "num_input_tokens_seen": 79101820, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.66796875, "step": 3668, "time_per_iteration": 2.359900951385498 }, { "auxiliary_loss_clip": 0.010929, "auxiliary_loss_mlp": 0.01040206, "balance_loss_clip": 1.02078676, "balance_loss_mlp": 1.02938235, "epoch": 0.2205922140387795, "flos": 24715189745280.0, "grad_norm": 1.595331213977969, "language_loss": 0.71167451, "learning_rate": 3.5390015488381e-06, "loss": 0.73300552, "num_input_tokens_seen": 79123320, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.63671875, "step": 3669, "time_per_iteration": 2.455019235610962 }, { "auxiliary_loss_clip": 0.01091524, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.01417136, "balance_loss_mlp": 1.02698052, "epoch": 0.22065233729144745, "flos": 23476122126720.0, "grad_norm": 2.3214837709153855, "language_loss": 0.85482693, "learning_rate": 3.5387602267057227e-06, "loss": 0.87606883, "num_input_tokens_seen": 79141615, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.64453125, "step": 3670, "time_per_iteration": 2.3846724033355713 }, { "auxiliary_loss_clip": 0.01097451, "auxiliary_loss_mlp": 0.01037446, "balance_loss_clip": 1.018718, "balance_loss_mlp": 1.03035593, "epoch": 0.22071246054411545, "flos": 35224116405120.0, "grad_norm": 1.8583426685725373, "language_loss": 0.76785362, "learning_rate": 3.5385188496589516e-06, "loss": 0.78920257, "num_input_tokens_seen": 79164910, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.671875, "step": 3671, "time_per_iteration": 2.4917216300964355 }, { "auxiliary_loss_clip": 0.01094413, "auxiliary_loss_mlp": 0.01037797, "balance_loss_clip": 1.01946259, "balance_loss_mlp": 1.02923059, "epoch": 0.2207725837967834, "flos": 18149318726400.0, "grad_norm": 1.928649615301234, "language_loss": 0.81345391, "learning_rate": 3.5382774177064007e-06, "loss": 0.83477604, "num_input_tokens_seen": 79179685, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.65234375, "step": 3672, "time_per_iteration": 2.3476321697235107 }, { "auxiliary_loss_clip": 0.01095329, "auxiliary_loss_mlp": 0.01043748, "balance_loss_clip": 1.02569973, "balance_loss_mlp": 1.02908397, "epoch": 0.22083270704945138, "flos": 20478793927680.0, "grad_norm": 1.8496931939472288, "language_loss": 0.73409235, "learning_rate": 3.538035930856685e-06, "loss": 0.75548315, "num_input_tokens_seen": 79196285, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6640625, "step": 3673, "time_per_iteration": 2.371300220489502 }, { "auxiliary_loss_clip": 0.01097022, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.01691508, "balance_loss_mlp": 1.03150582, "epoch": 0.22089283030211934, "flos": 34124527134720.0, "grad_norm": 1.8457946037171171, "language_loss": 0.76104242, "learning_rate": 3.5377943891184234e-06, "loss": 0.7823779, "num_input_tokens_seen": 79216060, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 0.65234375, "step": 3674, "time_per_iteration": 2.4701383113861084 }, { "auxiliary_loss_clip": 0.0109407, "auxiliary_loss_mlp": 0.01039001, "balance_loss_clip": 1.01868737, "balance_loss_mlp": 1.02909803, "epoch": 0.2209529535547873, "flos": 18076245517440.0, "grad_norm": 1.890786765170995, "language_loss": 0.7416544, "learning_rate": 3.5375527925002357e-06, "loss": 0.76298511, "num_input_tokens_seen": 79235145, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6484375, "step": 3675, "time_per_iteration": 2.3533267974853516 }, { "auxiliary_loss_clip": 0.01093822, "auxiliary_loss_mlp": 0.01041447, "balance_loss_clip": 1.02355337, "balance_loss_mlp": 1.02760625, "epoch": 0.22101307680745527, "flos": 27234103317120.0, "grad_norm": 1.6694224754185738, "language_loss": 0.80026603, "learning_rate": 3.537311141010744e-06, "loss": 0.82161874, "num_input_tokens_seen": 79256960, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6640625, "step": 3676, "time_per_iteration": 2.4297144412994385 }, { "auxiliary_loss_clip": 0.0109421, "auxiliary_loss_mlp": 0.0104276, "balance_loss_clip": 1.02320921, "balance_loss_mlp": 1.02742362, "epoch": 0.22107320006012326, "flos": 16542371445120.0, "grad_norm": 2.0785573910341615, "language_loss": 0.75704879, "learning_rate": 3.5370694346585718e-06, "loss": 0.77841848, "num_input_tokens_seen": 79274860, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.66796875, "step": 3677, "time_per_iteration": 2.4076225757598877 }, { "auxiliary_loss_clip": 0.01089111, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.02018619, "balance_loss_mlp": 1.02509761, "epoch": 0.22113332331279123, "flos": 22053376512000.0, "grad_norm": 1.6958729395602445, "language_loss": 0.83066964, "learning_rate": 3.5368276734523457e-06, "loss": 0.85194635, "num_input_tokens_seen": 79294005, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.640625, "step": 3678, "time_per_iteration": 2.39967942237854 }, { "auxiliary_loss_clip": 0.01093189, "auxiliary_loss_mlp": 0.0104082, "balance_loss_clip": 1.02174675, "balance_loss_mlp": 1.02956486, "epoch": 0.2211934465654592, "flos": 26611636953600.0, "grad_norm": 1.7033107421027307, "language_loss": 0.8909936, "learning_rate": 3.536585857400693e-06, "loss": 0.91233373, "num_input_tokens_seen": 79314005, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.63671875, "step": 3679, "time_per_iteration": 2.4268648624420166 }, { "auxiliary_loss_clip": 0.01094554, "auxiliary_loss_mlp": 0.01042149, "balance_loss_clip": 1.02258635, "balance_loss_mlp": 1.02932191, "epoch": 0.22125356981812716, "flos": 16359496410240.0, "grad_norm": 2.1653341397111094, "language_loss": 0.87012517, "learning_rate": 3.5363439865122436e-06, "loss": 0.89149213, "num_input_tokens_seen": 79331030, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.65234375, "step": 3680, "time_per_iteration": 2.3618102073669434 }, { "auxiliary_loss_clip": 0.01091267, "auxiliary_loss_mlp": 0.01040509, "balance_loss_clip": 1.02138829, "balance_loss_mlp": 1.02905679, "epoch": 0.22131369307079513, "flos": 21650094864000.0, "grad_norm": 1.7555962133591956, "language_loss": 0.81380695, "learning_rate": 3.5361020607956292e-06, "loss": 0.83512473, "num_input_tokens_seen": 79348560, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.62109375, "step": 3681, "time_per_iteration": 2.4112987518310547 }, { "auxiliary_loss_clip": 0.01090275, "auxiliary_loss_mlp": 0.01039758, "balance_loss_clip": 1.02131629, "balance_loss_mlp": 1.02861273, "epoch": 0.2213738163234631, "flos": 19608513667200.0, "grad_norm": 2.0406900260686482, "language_loss": 0.79659057, "learning_rate": 3.535860080259484e-06, "loss": 0.81789088, "num_input_tokens_seen": 79367175, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6171875, "step": 3682, "time_per_iteration": 2.3819100856781006 }, { "auxiliary_loss_clip": 0.01092671, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.01928973, "balance_loss_mlp": 1.02692652, "epoch": 0.22143393957613106, "flos": 23622268544640.0, "grad_norm": 1.5759452235882812, "language_loss": 0.77408659, "learning_rate": 3.5356180449124424e-06, "loss": 0.79540014, "num_input_tokens_seen": 79388435, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.65625, "step": 3683, "time_per_iteration": 2.40122652053833 }, { "auxiliary_loss_clip": 0.01092856, "auxiliary_loss_mlp": 0.0104137, "balance_loss_clip": 1.02216589, "balance_loss_mlp": 1.02644992, "epoch": 0.22149406282879905, "flos": 26176584101760.0, "grad_norm": 1.753791680216186, "language_loss": 0.72255617, "learning_rate": 3.535375954763143e-06, "loss": 0.74389839, "num_input_tokens_seen": 79407910, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6640625, "step": 3684, "time_per_iteration": 2.4123446941375732 }, { "auxiliary_loss_clip": 0.01095201, "auxiliary_loss_mlp": 0.01042766, "balance_loss_clip": 1.02344263, "balance_loss_mlp": 1.02942276, "epoch": 0.221554186081467, "flos": 14537867978880.0, "grad_norm": 1.8519296152254818, "language_loss": 0.79865098, "learning_rate": 3.535133809820226e-06, "loss": 0.82003069, "num_input_tokens_seen": 79424020, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.65625, "step": 3685, "time_per_iteration": 2.3655717372894287 }, { "auxiliary_loss_clip": 0.01088917, "auxiliary_loss_mlp": 0.01043412, "balance_loss_clip": 1.02562642, "balance_loss_mlp": 1.02571535, "epoch": 0.22161430933413498, "flos": 22237124330880.0, "grad_norm": 1.503663529879709, "language_loss": 0.87465549, "learning_rate": 3.5348916100923318e-06, "loss": 0.89597881, "num_input_tokens_seen": 79445605, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6328125, "step": 3686, "time_per_iteration": 2.425128936767578 }, { "auxiliary_loss_clip": 0.01089607, "auxiliary_loss_mlp": 0.01036597, "balance_loss_clip": 1.01724911, "balance_loss_mlp": 1.02612162, "epoch": 0.22167443258680294, "flos": 23475423899520.0, "grad_norm": 1.8229604735511442, "language_loss": 0.77771688, "learning_rate": 3.534649355588104e-06, "loss": 0.79897892, "num_input_tokens_seen": 79463850, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.63671875, "step": 3687, "time_per_iteration": 2.4009549617767334 }, { "auxiliary_loss_clip": 0.01096379, "auxiliary_loss_mlp": 0.01043615, "balance_loss_clip": 1.02317119, "balance_loss_mlp": 1.02852297, "epoch": 0.2217345558394709, "flos": 23220034237440.0, "grad_norm": 1.7664825785154417, "language_loss": 0.84929752, "learning_rate": 3.534407046316189e-06, "loss": 0.8706975, "num_input_tokens_seen": 79482845, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.6796875, "step": 3688, "time_per_iteration": 3.7730188369750977 }, { "auxiliary_loss_clip": 0.01097376, "auxiliary_loss_mlp": 0.01036841, "balance_loss_clip": 1.01797032, "balance_loss_mlp": 1.02991366, "epoch": 0.22179467909213887, "flos": 20010049747200.0, "grad_norm": 1.68473272440209, "language_loss": 0.8142544, "learning_rate": 3.5341646822852324e-06, "loss": 0.83559656, "num_input_tokens_seen": 79501550, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.671875, "step": 3689, "time_per_iteration": 2.377899646759033 }, { "auxiliary_loss_clip": 0.01091099, "auxiliary_loss_mlp": 0.010402, "balance_loss_clip": 1.02169847, "balance_loss_mlp": 1.02827299, "epoch": 0.22185480234480687, "flos": 19682005812480.0, "grad_norm": 1.7421215786775817, "language_loss": 0.69994974, "learning_rate": 3.5339222635038852e-06, "loss": 0.72126275, "num_input_tokens_seen": 79519680, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.62890625, "step": 3690, "time_per_iteration": 3.8241169452667236 }, { "auxiliary_loss_clip": 0.01093793, "auxiliary_loss_mlp": 0.01037736, "balance_loss_clip": 1.01738739, "balance_loss_mlp": 1.02620494, "epoch": 0.22191492559747483, "flos": 21980233480320.0, "grad_norm": 1.8785714147008459, "language_loss": 0.72514445, "learning_rate": 3.533679789980798e-06, "loss": 0.74645978, "num_input_tokens_seen": 79539000, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.67578125, "step": 3691, "time_per_iteration": 2.3743176460266113 }, { "auxiliary_loss_clip": 0.0109577, "auxiliary_loss_mlp": 0.01037011, "balance_loss_clip": 1.01713848, "balance_loss_mlp": 1.03043437, "epoch": 0.2219750488501428, "flos": 23220941932800.0, "grad_norm": 1.8898359293696025, "language_loss": 0.71459144, "learning_rate": 3.5334372617246243e-06, "loss": 0.73591924, "num_input_tokens_seen": 79559695, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.65234375, "step": 3692, "time_per_iteration": 3.8005530834198 }, { "auxiliary_loss_clip": 0.0109597, "auxiliary_loss_mlp": 0.01042106, "balance_loss_clip": 1.02129185, "balance_loss_mlp": 1.0287292, "epoch": 0.22203517210281076, "flos": 22452643353600.0, "grad_norm": 1.5534491366171796, "language_loss": 0.88025165, "learning_rate": 3.533194678744019e-06, "loss": 0.90163249, "num_input_tokens_seen": 79579095, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.671875, "step": 3693, "time_per_iteration": 2.387392520904541 }, { "auxiliary_loss_clip": 0.01089581, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.01600492, "balance_loss_mlp": 1.02734554, "epoch": 0.22209529535547873, "flos": 17563650802560.0, "grad_norm": 2.050372563762818, "language_loss": 0.85483646, "learning_rate": 3.53295204104764e-06, "loss": 0.87606072, "num_input_tokens_seen": 79596430, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.62109375, "step": 3694, "time_per_iteration": 2.3603076934814453 }, { "auxiliary_loss_clip": 0.01093952, "auxiliary_loss_mlp": 0.01041397, "balance_loss_clip": 1.02131057, "balance_loss_mlp": 1.02634811, "epoch": 0.2221554186081467, "flos": 21467987879040.0, "grad_norm": 2.895360832649079, "language_loss": 0.69272387, "learning_rate": 3.532709348644146e-06, "loss": 0.71407735, "num_input_tokens_seen": 79615825, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.67578125, "step": 3695, "time_per_iteration": 3.7850799560546875 }, { "auxiliary_loss_clip": 0.01090176, "auxiliary_loss_mlp": 0.01032612, "balance_loss_clip": 1.01567209, "balance_loss_mlp": 1.02860951, "epoch": 0.22221554186081466, "flos": 27672193457280.0, "grad_norm": 1.5009527281737483, "language_loss": 0.71522045, "learning_rate": 3.532466601542197e-06, "loss": 0.73644829, "num_input_tokens_seen": 79637875, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.6171875, "step": 3696, "time_per_iteration": 2.4492974281311035 }, { "auxiliary_loss_clip": 0.01091476, "auxiliary_loss_mlp": 0.01035465, "balance_loss_clip": 1.0167017, "balance_loss_mlp": 1.02732062, "epoch": 0.22227566511348265, "flos": 25957713588480.0, "grad_norm": 1.740135248140743, "language_loss": 0.87784004, "learning_rate": 3.532223799750458e-06, "loss": 0.89910948, "num_input_tokens_seen": 79656970, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.640625, "step": 3697, "time_per_iteration": 2.404921293258667 }, { "auxiliary_loss_clip": 0.01085887, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.01768041, "balance_loss_mlp": 1.02554131, "epoch": 0.22233578836615062, "flos": 39202085272320.0, "grad_norm": 1.534990969190034, "language_loss": 0.66207892, "learning_rate": 3.5319809432775916e-06, "loss": 0.6832754, "num_input_tokens_seen": 79680275, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.6015625, "step": 3698, "time_per_iteration": 2.548260450363159 }, { "auxiliary_loss_clip": 0.0109311, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.01574123, "balance_loss_mlp": 1.02653658, "epoch": 0.22239591161881858, "flos": 36282298936320.0, "grad_norm": 1.8441220316297018, "language_loss": 0.82402086, "learning_rate": 3.531738032132267e-06, "loss": 0.84530926, "num_input_tokens_seen": 79701255, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6640625, "step": 3699, "time_per_iteration": 2.4940197467803955 }, { "auxiliary_loss_clip": 0.01091321, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.01679158, "balance_loss_mlp": 1.02728462, "epoch": 0.22245603487148655, "flos": 19718559872640.0, "grad_norm": 1.8064773091700361, "language_loss": 0.79589581, "learning_rate": 3.531495066323152e-06, "loss": 0.81717837, "num_input_tokens_seen": 79721315, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.640625, "step": 3700, "time_per_iteration": 2.3791542053222656 }, { "auxiliary_loss_clip": 0.01097251, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.01687169, "balance_loss_mlp": 1.02991736, "epoch": 0.2225161581241545, "flos": 46278700704000.0, "grad_norm": 1.9619386117639426, "language_loss": 0.72068286, "learning_rate": 3.5312520458589176e-06, "loss": 0.74202335, "num_input_tokens_seen": 79742705, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.671875, "step": 3701, "time_per_iteration": 2.5742242336273193 }, { "auxiliary_loss_clip": 0.01090285, "auxiliary_loss_mlp": 0.01031364, "balance_loss_clip": 1.01376891, "balance_loss_mlp": 1.02655578, "epoch": 0.22257628137682248, "flos": 23695062462720.0, "grad_norm": 1.7292443873516452, "language_loss": 0.80025822, "learning_rate": 3.5310089707482366e-06, "loss": 0.82147467, "num_input_tokens_seen": 79763000, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.63671875, "step": 3702, "time_per_iteration": 2.3922669887542725 }, { "auxiliary_loss_clip": 0.01084204, "auxiliary_loss_mlp": 0.01035934, "balance_loss_clip": 1.01753962, "balance_loss_mlp": 1.02507901, "epoch": 0.22263640462949044, "flos": 19352984359680.0, "grad_norm": 1.942009494971027, "language_loss": 0.78257668, "learning_rate": 3.5307658409997834e-06, "loss": 0.80377805, "num_input_tokens_seen": 79781335, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.58984375, "step": 3703, "time_per_iteration": 2.350100040435791 }, { "auxiliary_loss_clip": 0.01090873, "auxiliary_loss_mlp": 0.01037792, "balance_loss_clip": 1.01740718, "balance_loss_mlp": 1.02613699, "epoch": 0.22269652788215843, "flos": 20775031747200.0, "grad_norm": 1.8852992213241526, "language_loss": 0.75073087, "learning_rate": 3.530522656622235e-06, "loss": 0.7720176, "num_input_tokens_seen": 79800150, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.6484375, "step": 3704, "time_per_iteration": 2.369068145751953 }, { "auxiliary_loss_clip": 0.01088072, "auxiliary_loss_mlp": 0.01033246, "balance_loss_clip": 1.0162226, "balance_loss_mlp": 1.02771926, "epoch": 0.2227566511348264, "flos": 47957045448960.0, "grad_norm": 1.9031711690997903, "language_loss": 0.64479697, "learning_rate": 3.53027941762427e-06, "loss": 0.66601014, "num_input_tokens_seen": 79822390, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.6015625, "step": 3705, "time_per_iteration": 2.602240800857544 }, { "auxiliary_loss_clip": 0.01017685, "auxiliary_loss_mlp": 0.01001016, "balance_loss_clip": 0.998954, "balance_loss_mlp": 1.0022943, "epoch": 0.22281677438749437, "flos": 66216166001280.0, "grad_norm": 1.2868087929793475, "language_loss": 0.65118122, "learning_rate": 3.5300361240145692e-06, "loss": 0.67136824, "num_input_tokens_seen": 79873350, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.15429688, "step": 3706, "time_per_iteration": 2.882190704345703 }, { "auxiliary_loss_clip": 0.01088894, "auxiliary_loss_mlp": 0.01033967, "balance_loss_clip": 1.01579905, "balance_loss_mlp": 1.02643394, "epoch": 0.22287689764016233, "flos": 21870536388480.0, "grad_norm": 1.7319909772015547, "language_loss": 0.80544299, "learning_rate": 3.5297927758018147e-06, "loss": 0.8266716, "num_input_tokens_seen": 79891715, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.625, "step": 3707, "time_per_iteration": 2.378695249557495 }, { "auxiliary_loss_clip": 0.01089534, "auxiliary_loss_mlp": 0.01032534, "balance_loss_clip": 1.01420009, "balance_loss_mlp": 1.02666414, "epoch": 0.2229370208928303, "flos": 27671250850560.0, "grad_norm": 1.9187304729061032, "language_loss": 0.78919291, "learning_rate": 3.5295493729946913e-06, "loss": 0.8104136, "num_input_tokens_seen": 79911175, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.62890625, "step": 3708, "time_per_iteration": 2.4452548027038574 }, { "auxiliary_loss_clip": 0.01092096, "auxiliary_loss_mlp": 0.01038704, "balance_loss_clip": 1.02138281, "balance_loss_mlp": 1.02801836, "epoch": 0.22299714414549826, "flos": 30153331071360.0, "grad_norm": 1.9911645034789165, "language_loss": 0.80301565, "learning_rate": 3.529305915601885e-06, "loss": 0.82432365, "num_input_tokens_seen": 79931875, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.640625, "step": 3709, "time_per_iteration": 2.4480786323547363 }, { "auxiliary_loss_clip": 0.01089464, "auxiliary_loss_mlp": 0.01041277, "balance_loss_clip": 1.02328813, "balance_loss_mlp": 1.0251708, "epoch": 0.22305726739816625, "flos": 23142178172160.0, "grad_norm": 1.9254495213443301, "language_loss": 0.68630362, "learning_rate": 3.5290624036320843e-06, "loss": 0.70761108, "num_input_tokens_seen": 79952445, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.640625, "step": 3710, "time_per_iteration": 2.4239273071289062 }, { "auxiliary_loss_clip": 0.01093177, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.01965773, "balance_loss_mlp": 1.02766013, "epoch": 0.22311739065083422, "flos": 19171051931520.0, "grad_norm": 2.317621370224953, "language_loss": 0.90193641, "learning_rate": 3.5288188370939796e-06, "loss": 0.92324269, "num_input_tokens_seen": 79971030, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.65625, "step": 3711, "time_per_iteration": 2.3515431880950928 }, { "auxiliary_loss_clip": 0.01088843, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.02275085, "balance_loss_mlp": 1.02645743, "epoch": 0.22317751390350218, "flos": 13617138936960.0, "grad_norm": 4.652123448453264, "language_loss": 0.89180648, "learning_rate": 3.5285752159962636e-06, "loss": 0.91310239, "num_input_tokens_seen": 79982085, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 3712, "time_per_iteration": 2.307796001434326 }, { "auxiliary_loss_clip": 0.01090042, "auxiliary_loss_mlp": 0.01038735, "balance_loss_clip": 1.01963782, "balance_loss_mlp": 1.02730691, "epoch": 0.22323763715617015, "flos": 11028468735360.0, "grad_norm": 3.9441240187166886, "language_loss": 0.74791253, "learning_rate": 3.5283315403476293e-06, "loss": 0.76920033, "num_input_tokens_seen": 79997460, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.625, "step": 3713, "time_per_iteration": 2.323309898376465 }, { "auxiliary_loss_clip": 0.01091054, "auxiliary_loss_mlp": 0.01038528, "balance_loss_clip": 1.01916838, "balance_loss_mlp": 1.02821803, "epoch": 0.22329776040883811, "flos": 41350012070400.0, "grad_norm": 2.2935807959358128, "language_loss": 0.62543035, "learning_rate": 3.5280878101567746e-06, "loss": 0.64672613, "num_input_tokens_seen": 80022450, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.62890625, "step": 3714, "time_per_iteration": 2.562187671661377 }, { "auxiliary_loss_clip": 0.01089448, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.01393175, "balance_loss_mlp": 1.02717614, "epoch": 0.22335788366150608, "flos": 25118296836480.0, "grad_norm": 2.254111070255291, "language_loss": 0.79423189, "learning_rate": 3.527844025432396e-06, "loss": 0.81543237, "num_input_tokens_seen": 80042100, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.625, "step": 3715, "time_per_iteration": 2.4427640438079834 }, { "auxiliary_loss_clip": 0.01092928, "auxiliary_loss_mlp": 0.01043068, "balance_loss_clip": 1.02509141, "balance_loss_mlp": 1.02842188, "epoch": 0.22341800691417404, "flos": 16982416621440.0, "grad_norm": 1.6876057861486706, "language_loss": 0.76629359, "learning_rate": 3.5276001861831945e-06, "loss": 0.78765357, "num_input_tokens_seen": 80059690, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6484375, "step": 3716, "time_per_iteration": 2.3579540252685547 }, { "auxiliary_loss_clip": 0.01091692, "auxiliary_loss_mlp": 0.01042045, "balance_loss_clip": 1.02359092, "balance_loss_mlp": 1.0271709, "epoch": 0.22347813016684204, "flos": 14135878051200.0, "grad_norm": 2.5328957448662504, "language_loss": 0.789105, "learning_rate": 3.527356292417872e-06, "loss": 0.81044239, "num_input_tokens_seen": 80076060, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.64453125, "step": 3717, "time_per_iteration": 2.3554940223693848 }, { "auxiliary_loss_clip": 0.01091346, "auxiliary_loss_mlp": 0.01041562, "balance_loss_clip": 1.02334726, "balance_loss_mlp": 1.0271852, "epoch": 0.22353825341951, "flos": 23582118614400.0, "grad_norm": 1.81745241849482, "language_loss": 0.68541479, "learning_rate": 3.527112344145132e-06, "loss": 0.70674384, "num_input_tokens_seen": 80094760, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.640625, "step": 3718, "time_per_iteration": 2.411811113357544 }, { "auxiliary_loss_clip": 0.01091912, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.01513803, "balance_loss_mlp": 1.02732539, "epoch": 0.22359837667217797, "flos": 29822948075520.0, "grad_norm": 1.6166210451380636, "language_loss": 0.80225945, "learning_rate": 3.5268683413736808e-06, "loss": 0.82352829, "num_input_tokens_seen": 80114475, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.64453125, "step": 3719, "time_per_iteration": 2.4607787132263184 }, { "auxiliary_loss_clip": 0.0109472, "auxiliary_loss_mlp": 0.01043127, "balance_loss_clip": 1.02308762, "balance_loss_mlp": 1.02672648, "epoch": 0.22365849992484593, "flos": 17602124987520.0, "grad_norm": 2.78199127452028, "language_loss": 0.86761129, "learning_rate": 3.526624284112226e-06, "loss": 0.88898981, "num_input_tokens_seen": 80132920, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.6796875, "step": 3720, "time_per_iteration": 2.3771848678588867 }, { "auxiliary_loss_clip": 0.0108845, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.01518726, "balance_loss_mlp": 1.02660191, "epoch": 0.2237186231775139, "flos": 22709848406400.0, "grad_norm": 1.5872777188625669, "language_loss": 0.74140322, "learning_rate": 3.5263801723694774e-06, "loss": 0.76262176, "num_input_tokens_seen": 80152845, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6171875, "step": 3721, "time_per_iteration": 2.3775877952575684 }, { "auxiliary_loss_clip": 0.01092908, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.01834702, "balance_loss_mlp": 1.02702498, "epoch": 0.22377874643018186, "flos": 13370651671680.0, "grad_norm": 1.954124855675532, "language_loss": 0.79196149, "learning_rate": 3.5261360061541464e-06, "loss": 0.81326091, "num_input_tokens_seen": 80170680, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.65625, "step": 3722, "time_per_iteration": 2.3554270267486572 }, { "auxiliary_loss_clip": 0.01088159, "auxiliary_loss_mlp": 0.01031285, "balance_loss_clip": 1.01399994, "balance_loss_mlp": 1.02750111, "epoch": 0.22383886968284986, "flos": 17893998887040.0, "grad_norm": 1.9514543259974668, "language_loss": 0.81833661, "learning_rate": 3.5258917854749476e-06, "loss": 0.839531, "num_input_tokens_seen": 80189030, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.609375, "step": 3723, "time_per_iteration": 2.3552000522613525 }, { "auxiliary_loss_clip": 0.01091717, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.01831865, "balance_loss_mlp": 1.02737498, "epoch": 0.22389899293551782, "flos": 23877972408960.0, "grad_norm": 2.2324367537490097, "language_loss": 0.84569204, "learning_rate": 3.5256475103405957e-06, "loss": 0.86697072, "num_input_tokens_seen": 80208365, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.640625, "step": 3724, "time_per_iteration": 2.4105513095855713 }, { "auxiliary_loss_clip": 0.01088083, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.01564169, "balance_loss_mlp": 1.0258913, "epoch": 0.2239591161881858, "flos": 27271181047680.0, "grad_norm": 2.9199364331410944, "language_loss": 0.78727692, "learning_rate": 3.525403180759809e-06, "loss": 0.8084929, "num_input_tokens_seen": 80228685, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.62109375, "step": 3725, "time_per_iteration": 2.414835214614868 }, { "auxiliary_loss_clip": 0.01089535, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.01965594, "balance_loss_mlp": 1.02788854, "epoch": 0.22401923944085375, "flos": 22235762787840.0, "grad_norm": 1.7324304899188787, "language_loss": 0.77203864, "learning_rate": 3.5251587967413065e-06, "loss": 0.79330832, "num_input_tokens_seen": 80247635, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6171875, "step": 3726, "time_per_iteration": 2.375723123550415 }, { "auxiliary_loss_clip": 0.01098139, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.01871562, "balance_loss_mlp": 1.02910829, "epoch": 0.22407936269352172, "flos": 12052959937920.0, "grad_norm": 2.304369136807236, "language_loss": 0.72655082, "learning_rate": 3.5249143582938096e-06, "loss": 0.74791902, "num_input_tokens_seen": 80260045, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.69140625, "step": 3727, "time_per_iteration": 3.7323873043060303 }, { "auxiliary_loss_clip": 0.0109475, "auxiliary_loss_mlp": 0.01035442, "balance_loss_clip": 1.01540256, "balance_loss_mlp": 1.02777815, "epoch": 0.22413948594618968, "flos": 19352565423360.0, "grad_norm": 1.9528337117237478, "language_loss": 0.87160379, "learning_rate": 3.5246698654260416e-06, "loss": 0.89290571, "num_input_tokens_seen": 80277680, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.66796875, "step": 3728, "time_per_iteration": 2.378406524658203 }, { "auxiliary_loss_clip": 0.01092542, "auxiliary_loss_mlp": 0.010429, "balance_loss_clip": 1.02264631, "balance_loss_mlp": 1.02828956, "epoch": 0.22419960919885765, "flos": 24168868790400.0, "grad_norm": 2.361303933533291, "language_loss": 0.80444628, "learning_rate": 3.5244253181467284e-06, "loss": 0.82580072, "num_input_tokens_seen": 80294795, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.64453125, "step": 3729, "time_per_iteration": 3.749037742614746 }, { "auxiliary_loss_clip": 0.01087705, "auxiliary_loss_mlp": 0.01042811, "balance_loss_clip": 1.02552557, "balance_loss_mlp": 1.02664852, "epoch": 0.22425973245152564, "flos": 27377805939840.0, "grad_norm": 1.6124074225621932, "language_loss": 0.86935675, "learning_rate": 3.5241807164645963e-06, "loss": 0.89066195, "num_input_tokens_seen": 80315425, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.609375, "step": 3730, "time_per_iteration": 2.427229881286621 }, { "auxiliary_loss_clip": 0.01086321, "auxiliary_loss_mlp": 0.01029149, "balance_loss_clip": 1.01183999, "balance_loss_mlp": 1.0268898, "epoch": 0.2243198557041936, "flos": 13734795818880.0, "grad_norm": 1.7947927727917496, "language_loss": 0.7302593, "learning_rate": 3.5239360603883754e-06, "loss": 0.75141394, "num_input_tokens_seen": 80333905, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.59375, "step": 3731, "time_per_iteration": 2.356825113296509 }, { "auxiliary_loss_clip": 0.01091264, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.01621199, "balance_loss_mlp": 1.02796686, "epoch": 0.22437997895686157, "flos": 19529854640640.0, "grad_norm": 1.7613538554927222, "language_loss": 0.75165671, "learning_rate": 3.523691349926797e-06, "loss": 0.77290875, "num_input_tokens_seen": 80352165, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6328125, "step": 3732, "time_per_iteration": 3.7950668334960938 }, { "auxiliary_loss_clip": 0.01092827, "auxiliary_loss_mlp": 0.01037561, "balance_loss_clip": 1.01928651, "balance_loss_mlp": 1.02970839, "epoch": 0.22444010220952954, "flos": 23695097374080.0, "grad_norm": 1.8654483444328418, "language_loss": 0.88087487, "learning_rate": 3.523446585088593e-06, "loss": 0.90217876, "num_input_tokens_seen": 80371305, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.6328125, "step": 3733, "time_per_iteration": 2.41096830368042 }, { "auxiliary_loss_clip": 0.01089176, "auxiliary_loss_mlp": 0.01037158, "balance_loss_clip": 1.01857305, "balance_loss_mlp": 1.02695596, "epoch": 0.2245002254621975, "flos": 22381804471680.0, "grad_norm": 1.4733569513093463, "language_loss": 0.84390181, "learning_rate": 3.5232017658825e-06, "loss": 0.86516517, "num_input_tokens_seen": 80391020, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.625, "step": 3734, "time_per_iteration": 2.37752366065979 }, { "auxiliary_loss_clip": 0.01092387, "auxiliary_loss_mlp": 0.01041879, "balance_loss_clip": 1.02395022, "balance_loss_mlp": 1.02973032, "epoch": 0.22456034871486547, "flos": 26941112254080.0, "grad_norm": 2.1503147038180583, "language_loss": 0.76111364, "learning_rate": 3.522956892317253e-06, "loss": 0.78245628, "num_input_tokens_seen": 80411365, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 3735, "time_per_iteration": 3.800471782684326 }, { "auxiliary_loss_clip": 0.01086445, "auxiliary_loss_mlp": 0.01037819, "balance_loss_clip": 1.02121353, "balance_loss_mlp": 1.02820969, "epoch": 0.22462047196753343, "flos": 28982344337280.0, "grad_norm": 1.6132254381629896, "language_loss": 0.84712738, "learning_rate": 3.5227119644015922e-06, "loss": 0.86837006, "num_input_tokens_seen": 80431075, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.58203125, "step": 3736, "time_per_iteration": 2.451631546020508 }, { "auxiliary_loss_clip": 0.01090463, "auxiliary_loss_mlp": 0.01036675, "balance_loss_clip": 1.01853168, "balance_loss_mlp": 1.02825832, "epoch": 0.22468059522020142, "flos": 20010294126720.0, "grad_norm": 1.6984357524143952, "language_loss": 0.86714351, "learning_rate": 3.5224669821442586e-06, "loss": 0.88841492, "num_input_tokens_seen": 80449240, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.62109375, "step": 3737, "time_per_iteration": 2.40842866897583 }, { "auxiliary_loss_clip": 0.01091119, "auxiliary_loss_mlp": 0.01042948, "balance_loss_clip": 1.02404153, "balance_loss_mlp": 1.02781999, "epoch": 0.2247407184728694, "flos": 29312971712640.0, "grad_norm": 1.7691706746175149, "language_loss": 0.7931546, "learning_rate": 3.522221945553995e-06, "loss": 0.81449533, "num_input_tokens_seen": 80467900, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6328125, "step": 3738, "time_per_iteration": 2.4300787448883057 }, { "auxiliary_loss_clip": 0.01090441, "auxiliary_loss_mlp": 0.01035812, "balance_loss_clip": 1.01818752, "balance_loss_mlp": 1.02712739, "epoch": 0.22480084172553735, "flos": 22309254933120.0, "grad_norm": 1.538876126115887, "language_loss": 0.76541984, "learning_rate": 3.521976854639546e-06, "loss": 0.78668243, "num_input_tokens_seen": 80487100, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6328125, "step": 3739, "time_per_iteration": 2.3852591514587402 }, { "auxiliary_loss_clip": 0.01088489, "auxiliary_loss_mlp": 0.01035824, "balance_loss_clip": 1.01801443, "balance_loss_mlp": 1.0266794, "epoch": 0.22486096497820532, "flos": 25590148128000.0, "grad_norm": 1.7292536875838214, "language_loss": 0.74429131, "learning_rate": 3.5217317094096576e-06, "loss": 0.7655344, "num_input_tokens_seen": 80508625, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6171875, "step": 3740, "time_per_iteration": 2.423879384994507 }, { "auxiliary_loss_clip": 0.01088777, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.01453042, "balance_loss_mlp": 1.02751994, "epoch": 0.22492108823087328, "flos": 17638853604480.0, "grad_norm": 1.6801050871333163, "language_loss": 0.75905859, "learning_rate": 3.5214865098730785e-06, "loss": 0.78026378, "num_input_tokens_seen": 80527345, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.61328125, "step": 3741, "time_per_iteration": 2.361703872680664 }, { "auxiliary_loss_clip": 0.01088098, "auxiliary_loss_mlp": 0.01032122, "balance_loss_clip": 1.01457429, "balance_loss_mlp": 1.02740383, "epoch": 0.22498121148354125, "flos": 16033721713920.0, "grad_norm": 1.6865714909942253, "language_loss": 0.87917626, "learning_rate": 3.52124125603856e-06, "loss": 0.90037847, "num_input_tokens_seen": 80545545, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.60546875, "step": 3742, "time_per_iteration": 2.3657896518707275 }, { "auxiliary_loss_clip": 0.01087726, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.02014816, "balance_loss_mlp": 1.02707016, "epoch": 0.22504133473620924, "flos": 24022652549760.0, "grad_norm": 1.645102043551272, "language_loss": 0.81376117, "learning_rate": 3.520995947914854e-06, "loss": 0.83502257, "num_input_tokens_seen": 80565040, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.609375, "step": 3743, "time_per_iteration": 2.4094078540802 }, { "auxiliary_loss_clip": 0.01089297, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.01363063, "balance_loss_mlp": 1.02649498, "epoch": 0.2251014579888772, "flos": 16763022437760.0, "grad_norm": 1.9188485400483928, "language_loss": 0.63366693, "learning_rate": 3.520750585510715e-06, "loss": 0.65486485, "num_input_tokens_seen": 80582815, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.62890625, "step": 3744, "time_per_iteration": 2.356104850769043 }, { "auxiliary_loss_clip": 0.0108808, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.01964712, "balance_loss_mlp": 1.02572632, "epoch": 0.22516158124154517, "flos": 13990150569600.0, "grad_norm": 2.771667982573645, "language_loss": 0.76202762, "learning_rate": 3.5205051688348997e-06, "loss": 0.7832756, "num_input_tokens_seen": 80600865, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.625, "step": 3745, "time_per_iteration": 2.3604066371917725 }, { "auxiliary_loss_clip": 0.01087369, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.01923621, "balance_loss_mlp": 1.02590382, "epoch": 0.22522170449421314, "flos": 14389207943040.0, "grad_norm": 1.9527389856733632, "language_loss": 0.80728346, "learning_rate": 3.520259697896166e-06, "loss": 0.82852626, "num_input_tokens_seen": 80617455, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.61328125, "step": 3746, "time_per_iteration": 2.345534324645996 }, { "auxiliary_loss_clip": 0.0108884, "auxiliary_loss_mlp": 0.0103852, "balance_loss_clip": 1.02028084, "balance_loss_mlp": 1.02645183, "epoch": 0.2252818277468811, "flos": 23804410440960.0, "grad_norm": 2.1405810057766455, "language_loss": 0.86256254, "learning_rate": 3.5200141727032744e-06, "loss": 0.88383621, "num_input_tokens_seen": 80635125, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.625, "step": 3747, "time_per_iteration": 2.391200304031372 }, { "auxiliary_loss_clip": 0.01087546, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.01382697, "balance_loss_mlp": 1.02491987, "epoch": 0.22534195099954907, "flos": 24716865490560.0, "grad_norm": 1.869984033909797, "language_loss": 0.76360589, "learning_rate": 3.519768593264987e-06, "loss": 0.78479242, "num_input_tokens_seen": 80656370, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.625, "step": 3748, "time_per_iteration": 2.4164044857025146 }, { "auxiliary_loss_clip": 0.01091219, "auxiliary_loss_mlp": 0.01033318, "balance_loss_clip": 1.01700997, "balance_loss_mlp": 1.02786016, "epoch": 0.22540207425221703, "flos": 21031294193280.0, "grad_norm": 1.5998245596982745, "language_loss": 0.79927492, "learning_rate": 3.519522959590068e-06, "loss": 0.82052028, "num_input_tokens_seen": 80676495, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.6328125, "step": 3749, "time_per_iteration": 2.3905251026153564 }, { "auxiliary_loss_clip": 0.01084441, "auxiliary_loss_mlp": 0.01034705, "balance_loss_clip": 1.01809907, "balance_loss_mlp": 1.02500296, "epoch": 0.22546219750488503, "flos": 19389363863040.0, "grad_norm": 1.5118862951767982, "language_loss": 0.79424167, "learning_rate": 3.5192772716872827e-06, "loss": 0.81543308, "num_input_tokens_seen": 80694755, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59375, "step": 3750, "time_per_iteration": 2.358569860458374 }, { "auxiliary_loss_clip": 0.01090923, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.0186379, "balance_loss_mlp": 1.02761602, "epoch": 0.225522320757553, "flos": 25191439868160.0, "grad_norm": 1.724506224620897, "language_loss": 0.8158868, "learning_rate": 3.5190315295653996e-06, "loss": 0.83716011, "num_input_tokens_seen": 80713670, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6328125, "step": 3751, "time_per_iteration": 2.4105162620544434 }, { "auxiliary_loss_clip": 0.01091251, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.01785338, "balance_loss_mlp": 1.02820849, "epoch": 0.22558244401022096, "flos": 17162219456640.0, "grad_norm": 1.9508434743411043, "language_loss": 0.83576322, "learning_rate": 3.518785733233189e-06, "loss": 0.85702997, "num_input_tokens_seen": 80731450, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.62890625, "step": 3752, "time_per_iteration": 2.3523709774017334 }, { "auxiliary_loss_clip": 0.01087461, "auxiliary_loss_mlp": 0.01032953, "balance_loss_clip": 1.01646626, "balance_loss_mlp": 1.02681637, "epoch": 0.22564256726288892, "flos": 15230125883520.0, "grad_norm": 1.6685247434861339, "language_loss": 0.78270149, "learning_rate": 3.518539882699422e-06, "loss": 0.80390561, "num_input_tokens_seen": 80748415, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.60546875, "step": 3753, "time_per_iteration": 2.360663890838623 }, { "auxiliary_loss_clip": 0.01086084, "auxiliary_loss_mlp": 0.01036631, "balance_loss_clip": 1.01841629, "balance_loss_mlp": 1.02530885, "epoch": 0.2257026905155569, "flos": 34567225574400.0, "grad_norm": 2.207977685473492, "language_loss": 0.7851181, "learning_rate": 3.518293977972873e-06, "loss": 0.80634522, "num_input_tokens_seen": 80770835, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.60546875, "step": 3754, "time_per_iteration": 2.481651544570923 }, { "auxiliary_loss_clip": 0.01088618, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.01536787, "balance_loss_mlp": 1.02941537, "epoch": 0.22576281376822485, "flos": 19937395474560.0, "grad_norm": 4.7922080843468144, "language_loss": 0.70141995, "learning_rate": 3.5180480190623173e-06, "loss": 0.72264206, "num_input_tokens_seen": 80787840, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.59375, "step": 3755, "time_per_iteration": 2.3769328594207764 }, { "auxiliary_loss_clip": 0.01093101, "auxiliary_loss_mlp": 0.01042235, "balance_loss_clip": 1.02322125, "balance_loss_mlp": 1.02921903, "epoch": 0.22582293702089282, "flos": 24601023999360.0, "grad_norm": 2.122203370092252, "language_loss": 0.77696723, "learning_rate": 3.517802005976533e-06, "loss": 0.79832059, "num_input_tokens_seen": 80806335, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.640625, "step": 3756, "time_per_iteration": 2.3991057872772217 }, { "auxiliary_loss_clip": 0.01090352, "auxiliary_loss_mlp": 0.01037603, "balance_loss_clip": 1.0203414, "balance_loss_mlp": 1.0274297, "epoch": 0.2258830602735608, "flos": 23034436116480.0, "grad_norm": 1.6967341676801726, "language_loss": 0.82532358, "learning_rate": 3.5175559387242988e-06, "loss": 0.8466031, "num_input_tokens_seen": 80825355, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.62890625, "step": 3757, "time_per_iteration": 2.410648822784424 }, { "auxiliary_loss_clip": 0.01089515, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.01335442, "balance_loss_mlp": 1.0271461, "epoch": 0.22594318352622877, "flos": 22157487786240.0, "grad_norm": 1.7322563283534598, "language_loss": 0.73117363, "learning_rate": 3.517309817314397e-06, "loss": 0.75238585, "num_input_tokens_seen": 80842570, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.625, "step": 3758, "time_per_iteration": 2.3524677753448486 }, { "auxiliary_loss_clip": 0.01092195, "auxiliary_loss_mlp": 0.01040809, "balance_loss_clip": 1.02203381, "balance_loss_mlp": 1.02854705, "epoch": 0.22600330677889674, "flos": 20593273875840.0, "grad_norm": 2.1786804289924566, "language_loss": 0.77346629, "learning_rate": 3.5170636417556113e-06, "loss": 0.79479635, "num_input_tokens_seen": 80858745, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.63671875, "step": 3759, "time_per_iteration": 2.3560667037963867 }, { "auxiliary_loss_clip": 0.01091526, "auxiliary_loss_mlp": 0.01036689, "balance_loss_clip": 1.01812804, "balance_loss_mlp": 1.02664661, "epoch": 0.2260634300315647, "flos": 35658436118400.0, "grad_norm": 2.3520698153818103, "language_loss": 0.78370064, "learning_rate": 3.516817412056726e-06, "loss": 0.80498278, "num_input_tokens_seen": 80880085, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6484375, "step": 3760, "time_per_iteration": 2.4952027797698975 }, { "auxiliary_loss_clip": 0.01017567, "auxiliary_loss_mlp": 0.01005238, "balance_loss_clip": 1.00330639, "balance_loss_mlp": 1.00258195, "epoch": 0.22612355328423267, "flos": 72087579699840.0, "grad_norm": 0.9459283340013351, "language_loss": 0.60087264, "learning_rate": 3.516571128226529e-06, "loss": 0.62110072, "num_input_tokens_seen": 80937660, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.15039062, "step": 3761, "time_per_iteration": 2.972327470779419 }, { "auxiliary_loss_clip": 0.01091507, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.01953173, "balance_loss_mlp": 1.02764082, "epoch": 0.22618367653690064, "flos": 22782677235840.0, "grad_norm": 1.9425600130414724, "language_loss": 0.7698741, "learning_rate": 3.51632479027381e-06, "loss": 0.79116946, "num_input_tokens_seen": 80956265, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.640625, "step": 3762, "time_per_iteration": 2.386636972427368 }, { "auxiliary_loss_clip": 0.01091936, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.01662946, "balance_loss_mlp": 1.02746713, "epoch": 0.22624379978956863, "flos": 20447232192000.0, "grad_norm": 2.0975437539423507, "language_loss": 0.78804028, "learning_rate": 3.5160783982073595e-06, "loss": 0.80930692, "num_input_tokens_seen": 80975185, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.64453125, "step": 3763, "time_per_iteration": 2.3685462474823 }, { "auxiliary_loss_clip": 0.01093227, "auxiliary_loss_mlp": 0.01038478, "balance_loss_clip": 1.01939213, "balance_loss_mlp": 1.02874374, "epoch": 0.2263039230422366, "flos": 17493335591040.0, "grad_norm": 1.612111878829377, "language_loss": 0.9122529, "learning_rate": 3.5158319520359703e-06, "loss": 0.93356991, "num_input_tokens_seen": 80992830, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.64453125, "step": 3764, "time_per_iteration": 2.3483502864837646 }, { "auxiliary_loss_clip": 0.01090226, "auxiliary_loss_mlp": 0.01038333, "balance_loss_clip": 1.02058291, "balance_loss_mlp": 1.0291301, "epoch": 0.22636404629490456, "flos": 28328490794880.0, "grad_norm": 1.8936628916638818, "language_loss": 0.75164557, "learning_rate": 3.515585451768438e-06, "loss": 0.7729311, "num_input_tokens_seen": 81013675, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.609375, "step": 3765, "time_per_iteration": 2.42386794090271 }, { "auxiliary_loss_clip": 0.01089541, "auxiliary_loss_mlp": 0.01037407, "balance_loss_clip": 1.01898909, "balance_loss_mlp": 1.02846622, "epoch": 0.22642416954757252, "flos": 17488308355200.0, "grad_norm": 2.0950946375948987, "language_loss": 0.89427751, "learning_rate": 3.51533889741356e-06, "loss": 0.91554701, "num_input_tokens_seen": 81030345, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.609375, "step": 3766, "time_per_iteration": 3.7598423957824707 }, { "auxiliary_loss_clip": 0.01088469, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.01396644, "balance_loss_mlp": 1.02780557, "epoch": 0.2264842928002405, "flos": 24383515029120.0, "grad_norm": 1.5057994214588577, "language_loss": 0.74437904, "learning_rate": 3.515092288980135e-06, "loss": 0.76559377, "num_input_tokens_seen": 81051000, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.60546875, "step": 3767, "time_per_iteration": 2.406935453414917 }, { "auxiliary_loss_clip": 0.01088493, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.01472366, "balance_loss_mlp": 1.02614951, "epoch": 0.22654441605290845, "flos": 19829443950720.0, "grad_norm": 1.371887647497329, "language_loss": 0.71427721, "learning_rate": 3.5148456264769625e-06, "loss": 0.7355063, "num_input_tokens_seen": 81071205, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.625, "step": 3768, "time_per_iteration": 2.3927323818206787 }, { "auxiliary_loss_clip": 0.01093473, "auxiliary_loss_mlp": 0.01042629, "balance_loss_clip": 1.02324569, "balance_loss_mlp": 1.03060412, "epoch": 0.22660453930557642, "flos": 27453322944000.0, "grad_norm": 1.9943603153548417, "language_loss": 0.78653377, "learning_rate": 3.5145989099128465e-06, "loss": 0.80789483, "num_input_tokens_seen": 81091880, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.62890625, "step": 3769, "time_per_iteration": 3.8303472995758057 }, { "auxiliary_loss_clip": 0.01092521, "auxiliary_loss_mlp": 0.01037786, "balance_loss_clip": 1.0190109, "balance_loss_mlp": 1.02721548, "epoch": 0.2266646625582444, "flos": 23987006184960.0, "grad_norm": 1.8630213958713637, "language_loss": 0.68793172, "learning_rate": 3.5143521392965914e-06, "loss": 0.70923483, "num_input_tokens_seen": 81113290, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.65234375, "step": 3770, "time_per_iteration": 2.429872751235962 }, { "auxiliary_loss_clip": 0.01091515, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.01748979, "balance_loss_mlp": 1.02674448, "epoch": 0.22672478581091238, "flos": 26026946547840.0, "grad_norm": 1.4792494688847235, "language_loss": 0.80242562, "learning_rate": 3.5141053146370047e-06, "loss": 0.82370043, "num_input_tokens_seen": 81133535, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6484375, "step": 3771, "time_per_iteration": 3.772441864013672 }, { "auxiliary_loss_clip": 0.01087493, "auxiliary_loss_mlp": 0.01044862, "balance_loss_clip": 1.02613378, "balance_loss_mlp": 1.02693558, "epoch": 0.22678490906358034, "flos": 23840685210240.0, "grad_norm": 1.475174598611432, "language_loss": 0.78800523, "learning_rate": 3.513858435942893e-06, "loss": 0.80932879, "num_input_tokens_seen": 81154650, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.60546875, "step": 3772, "time_per_iteration": 2.4286961555480957 }, { "auxiliary_loss_clip": 0.01017631, "auxiliary_loss_mlp": 0.01005329, "balance_loss_clip": 1.00324333, "balance_loss_mlp": 1.00297904, "epoch": 0.2268450323162483, "flos": 65044409351040.0, "grad_norm": 0.6507708474992, "language_loss": 0.54394597, "learning_rate": 3.5136115032230683e-06, "loss": 0.56417555, "num_input_tokens_seen": 81221240, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.14648438, "step": 3773, "time_per_iteration": 3.0998849868774414 }, { "auxiliary_loss_clip": 0.0108632, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.01757145, "balance_loss_mlp": 1.0259006, "epoch": 0.22690515556891627, "flos": 22525053246720.0, "grad_norm": 1.9145992280613224, "language_loss": 0.70580399, "learning_rate": 3.5133645164863427e-06, "loss": 0.7270208, "num_input_tokens_seen": 81241520, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 3774, "time_per_iteration": 2.4120614528656006 }, { "auxiliary_loss_clip": 0.01086498, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.02208352, "balance_loss_mlp": 1.02557778, "epoch": 0.22696527882158424, "flos": 18222461758080.0, "grad_norm": 2.097856629454698, "language_loss": 0.74524856, "learning_rate": 3.5131174757415298e-06, "loss": 0.76650709, "num_input_tokens_seen": 81256825, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.609375, "step": 3775, "time_per_iteration": 3.832169532775879 }, { "auxiliary_loss_clip": 0.01088387, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.01764596, "balance_loss_mlp": 1.02649021, "epoch": 0.22702540207425223, "flos": 17018307365760.0, "grad_norm": 1.7580701183983654, "language_loss": 0.82575047, "learning_rate": 3.512870380997446e-06, "loss": 0.84698129, "num_input_tokens_seen": 81275695, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.62109375, "step": 3776, "time_per_iteration": 2.366807460784912 }, { "auxiliary_loss_clip": 0.01090168, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.01935768, "balance_loss_mlp": 1.02692699, "epoch": 0.2270855253269202, "flos": 21324634369920.0, "grad_norm": 3.686259464893189, "language_loss": 0.83219683, "learning_rate": 3.5126232322629114e-06, "loss": 0.85347235, "num_input_tokens_seen": 81294920, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6328125, "step": 3777, "time_per_iteration": 2.394103765487671 }, { "auxiliary_loss_clip": 0.01090977, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.01834691, "balance_loss_mlp": 1.02901387, "epoch": 0.22714564857958816, "flos": 23549334981120.0, "grad_norm": 2.746636482275372, "language_loss": 0.72792417, "learning_rate": 3.5123760295467435e-06, "loss": 0.74919617, "num_input_tokens_seen": 81314275, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.62109375, "step": 3778, "time_per_iteration": 2.4154746532440186 }, { "auxiliary_loss_clip": 0.01088858, "auxiliary_loss_mlp": 0.01036751, "balance_loss_clip": 1.01973939, "balance_loss_mlp": 1.0261116, "epoch": 0.22720577183225613, "flos": 25988821476480.0, "grad_norm": 3.9146729769934834, "language_loss": 0.64117897, "learning_rate": 3.5121287728577657e-06, "loss": 0.66243505, "num_input_tokens_seen": 81333890, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.625, "step": 3779, "time_per_iteration": 2.416792154312134 }, { "auxiliary_loss_clip": 0.01088829, "auxiliary_loss_mlp": 0.01033635, "balance_loss_clip": 1.01594448, "balance_loss_mlp": 1.02821076, "epoch": 0.2272658950849241, "flos": 20813017173120.0, "grad_norm": 1.5792572457746858, "language_loss": 0.70214581, "learning_rate": 3.5118814622048012e-06, "loss": 0.72337043, "num_input_tokens_seen": 81353640, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.609375, "step": 3780, "time_per_iteration": 2.3873915672302246 }, { "auxiliary_loss_clip": 0.01089188, "auxiliary_loss_mlp": 0.0103907, "balance_loss_clip": 1.02037811, "balance_loss_mlp": 1.02833617, "epoch": 0.22732601833759206, "flos": 23908347158400.0, "grad_norm": 1.6702840634077138, "language_loss": 0.89330554, "learning_rate": 3.5116340975966766e-06, "loss": 0.91458809, "num_input_tokens_seen": 81371595, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.609375, "step": 3781, "time_per_iteration": 2.38491153717041 }, { "auxiliary_loss_clip": 0.01088734, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.01687992, "balance_loss_mlp": 1.02617013, "epoch": 0.22738614159026002, "flos": 15923500951680.0, "grad_norm": 1.9916366175952125, "language_loss": 0.74606478, "learning_rate": 3.5113866790422195e-06, "loss": 0.7673074, "num_input_tokens_seen": 81388435, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.625, "step": 3782, "time_per_iteration": 2.357717514038086 }, { "auxiliary_loss_clip": 0.01087326, "auxiliary_loss_mlp": 0.01033593, "balance_loss_clip": 1.01683199, "balance_loss_mlp": 1.02649641, "epoch": 0.22744626484292801, "flos": 24204410421120.0, "grad_norm": 1.4565571588063595, "language_loss": 0.82687902, "learning_rate": 3.51113920655026e-06, "loss": 0.84808826, "num_input_tokens_seen": 81410195, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.609375, "step": 3783, "time_per_iteration": 2.4019362926483154 }, { "auxiliary_loss_clip": 0.01088729, "auxiliary_loss_mlp": 0.01037909, "balance_loss_clip": 1.01919305, "balance_loss_mlp": 1.02750099, "epoch": 0.22750638809559598, "flos": 24790427458560.0, "grad_norm": 1.8598769008253748, "language_loss": 0.76036566, "learning_rate": 3.510891680129629e-06, "loss": 0.78163207, "num_input_tokens_seen": 81430060, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.61328125, "step": 3784, "time_per_iteration": 2.408007860183716 }, { "auxiliary_loss_clip": 0.01085311, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.0153892, "balance_loss_mlp": 1.02449834, "epoch": 0.22756651134826394, "flos": 22235413674240.0, "grad_norm": 1.684686076380219, "language_loss": 0.7122314, "learning_rate": 3.51064409978916e-06, "loss": 0.73342031, "num_input_tokens_seen": 81447375, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.609375, "step": 3785, "time_per_iteration": 2.3783719539642334 }, { "auxiliary_loss_clip": 0.0101713, "auxiliary_loss_mlp": 0.01009888, "balance_loss_clip": 1.00788498, "balance_loss_mlp": 1.00312281, "epoch": 0.2276266346009319, "flos": 62704006894080.0, "grad_norm": 0.8252731897482487, "language_loss": 0.61938071, "learning_rate": 3.5103964655376894e-06, "loss": 0.63965088, "num_input_tokens_seen": 81505235, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.140625, "step": 3786, "time_per_iteration": 2.984421730041504 }, { "auxiliary_loss_clip": 0.01094468, "auxiliary_loss_mlp": 0.01037413, "balance_loss_clip": 1.01802897, "balance_loss_mlp": 1.02929771, "epoch": 0.22768675785359987, "flos": 18613245139200.0, "grad_norm": 2.307840027034818, "language_loss": 0.86449611, "learning_rate": 3.510148777384054e-06, "loss": 0.88581491, "num_input_tokens_seen": 81518685, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.65234375, "step": 3787, "time_per_iteration": 2.3531198501586914 }, { "auxiliary_loss_clip": 0.01085635, "auxiliary_loss_mlp": 0.01033202, "balance_loss_clip": 1.01651263, "balance_loss_mlp": 1.02698684, "epoch": 0.22774688110626784, "flos": 26868981651840.0, "grad_norm": 1.231732535080988, "language_loss": 0.72669089, "learning_rate": 3.5099010353370934e-06, "loss": 0.74787927, "num_input_tokens_seen": 81538940, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5859375, "step": 3788, "time_per_iteration": 2.4292914867401123 }, { "auxiliary_loss_clip": 0.01087532, "auxiliary_loss_mlp": 0.01029437, "balance_loss_clip": 1.01235449, "balance_loss_mlp": 1.02739811, "epoch": 0.2278070043589358, "flos": 15552863291520.0, "grad_norm": 2.5254162728632545, "language_loss": 0.67666602, "learning_rate": 3.5096532394056487e-06, "loss": 0.69783568, "num_input_tokens_seen": 81555525, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.6015625, "step": 3789, "time_per_iteration": 2.3270905017852783 }, { "auxiliary_loss_clip": 0.01087963, "auxiliary_loss_mlp": 0.01039137, "balance_loss_clip": 1.02088594, "balance_loss_mlp": 1.02717233, "epoch": 0.2278671276116038, "flos": 22415775091200.0, "grad_norm": 1.8408681320616835, "language_loss": 0.75489384, "learning_rate": 3.5094053895985632e-06, "loss": 0.77616483, "num_input_tokens_seen": 81576305, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.609375, "step": 3790, "time_per_iteration": 2.4211106300354004 }, { "auxiliary_loss_clip": 0.01085204, "auxiliary_loss_mlp": 0.01031517, "balance_loss_clip": 1.01396954, "balance_loss_mlp": 1.02503514, "epoch": 0.22792725086427176, "flos": 20630316695040.0, "grad_norm": 1.87915916690466, "language_loss": 0.90536761, "learning_rate": 3.5091574859246818e-06, "loss": 0.92653483, "num_input_tokens_seen": 81594115, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6015625, "step": 3791, "time_per_iteration": 2.3681387901306152 }, { "auxiliary_loss_clip": 0.01087979, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.01642704, "balance_loss_mlp": 1.02538478, "epoch": 0.22798737411693973, "flos": 31427661029760.0, "grad_norm": 2.0857950402727568, "language_loss": 0.82091236, "learning_rate": 3.508909528392852e-06, "loss": 0.84214282, "num_input_tokens_seen": 81615355, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.625, "step": 3792, "time_per_iteration": 2.462547779083252 }, { "auxiliary_loss_clip": 0.01016086, "auxiliary_loss_mlp": 0.01011285, "balance_loss_clip": 1.00915098, "balance_loss_mlp": 1.00226247, "epoch": 0.2280474973696077, "flos": 52394121095040.0, "grad_norm": 1.1432011257847694, "language_loss": 0.65706909, "learning_rate": 3.5086615170119224e-06, "loss": 0.67734277, "num_input_tokens_seen": 81662075, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.13867188, "step": 3793, "time_per_iteration": 2.7637650966644287 }, { "auxiliary_loss_clip": 0.0109298, "auxiliary_loss_mlp": 0.01042921, "balance_loss_clip": 1.02335882, "balance_loss_mlp": 1.02766323, "epoch": 0.22810762062227566, "flos": 26394861121920.0, "grad_norm": 2.602653448535137, "language_loss": 0.76358742, "learning_rate": 3.508413451790744e-06, "loss": 0.78494644, "num_input_tokens_seen": 81681625, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.65234375, "step": 3794, "time_per_iteration": 2.426140069961548 }, { "auxiliary_loss_clip": 0.01088427, "auxiliary_loss_mlp": 0.01034406, "balance_loss_clip": 1.01600027, "balance_loss_mlp": 1.02610791, "epoch": 0.22816774387494362, "flos": 25629076160640.0, "grad_norm": 1.725615627529767, "language_loss": 0.80870014, "learning_rate": 3.50816533273817e-06, "loss": 0.82992846, "num_input_tokens_seen": 81701170, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.625, "step": 3795, "time_per_iteration": 2.411367654800415 }, { "auxiliary_loss_clip": 0.01086954, "auxiliary_loss_mlp": 0.01039503, "balance_loss_clip": 1.02106178, "balance_loss_mlp": 1.02629185, "epoch": 0.22822786712761162, "flos": 22450618494720.0, "grad_norm": 1.6378856943664284, "language_loss": 0.76962423, "learning_rate": 3.507917159863054e-06, "loss": 0.79088885, "num_input_tokens_seen": 81721265, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.60546875, "step": 3796, "time_per_iteration": 2.4178855419158936 }, { "auxiliary_loss_clip": 0.01086449, "auxiliary_loss_mlp": 0.01032688, "balance_loss_clip": 1.01664233, "balance_loss_mlp": 1.02601695, "epoch": 0.22828799038027958, "flos": 12201759619200.0, "grad_norm": 2.2935202414850733, "language_loss": 0.95839965, "learning_rate": 3.507668933174254e-06, "loss": 0.97959107, "num_input_tokens_seen": 81736565, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.60546875, "step": 3797, "time_per_iteration": 2.3302457332611084 }, { "auxiliary_loss_clip": 0.01088704, "auxiliary_loss_mlp": 0.01037217, "balance_loss_clip": 1.0200386, "balance_loss_mlp": 1.02784896, "epoch": 0.22834811363294755, "flos": 22084763690880.0, "grad_norm": 1.497063928582699, "language_loss": 0.81556934, "learning_rate": 3.5074206526806274e-06, "loss": 0.83682853, "num_input_tokens_seen": 81756240, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.609375, "step": 3798, "time_per_iteration": 2.421319007873535 }, { "auxiliary_loss_clip": 0.01086642, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.01555872, "balance_loss_mlp": 1.02524805, "epoch": 0.2284082368856155, "flos": 24859555683840.0, "grad_norm": 1.9221700156500818, "language_loss": 0.79203105, "learning_rate": 3.507172318391036e-06, "loss": 0.8132441, "num_input_tokens_seen": 81775720, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.61328125, "step": 3799, "time_per_iteration": 2.40585994720459 }, { "auxiliary_loss_clip": 0.01087693, "auxiliary_loss_mlp": 0.01032413, "balance_loss_clip": 1.01581931, "balance_loss_mlp": 1.02675533, "epoch": 0.22846836013828348, "flos": 23291815726080.0, "grad_norm": 1.4823114103646389, "language_loss": 0.74984872, "learning_rate": 3.506923930314341e-06, "loss": 0.77104974, "num_input_tokens_seen": 81795830, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.609375, "step": 3800, "time_per_iteration": 2.3957200050354004 }, { "auxiliary_loss_clip": 0.01088192, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.0217557, "balance_loss_mlp": 1.0274626, "epoch": 0.22852848339095144, "flos": 27415093138560.0, "grad_norm": 1.7382232068920265, "language_loss": 0.64025426, "learning_rate": 3.5066754884594072e-06, "loss": 0.66151953, "num_input_tokens_seen": 81815745, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.609375, "step": 3801, "time_per_iteration": 2.4217402935028076 }, { "auxiliary_loss_clip": 0.01084292, "auxiliary_loss_mlp": 0.01028622, "balance_loss_clip": 1.0131011, "balance_loss_mlp": 1.02608454, "epoch": 0.2285886066436194, "flos": 26320007433600.0, "grad_norm": 1.5845091426353741, "language_loss": 0.81693745, "learning_rate": 3.5064269928351005e-06, "loss": 0.83806658, "num_input_tokens_seen": 81835155, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.58203125, "step": 3802, "time_per_iteration": 2.4141721725463867 }, { "auxiliary_loss_clip": 0.01089418, "auxiliary_loss_mlp": 0.01041514, "balance_loss_clip": 1.02356124, "balance_loss_mlp": 1.02765429, "epoch": 0.2286487298962874, "flos": 29715171108480.0, "grad_norm": 1.7399230486354544, "language_loss": 0.78634125, "learning_rate": 3.5061784434502897e-06, "loss": 0.80765057, "num_input_tokens_seen": 81855655, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6171875, "step": 3803, "time_per_iteration": 2.431398391723633 }, { "auxiliary_loss_clip": 0.01086104, "auxiliary_loss_mlp": 0.01035511, "balance_loss_clip": 1.0178566, "balance_loss_mlp": 1.02444887, "epoch": 0.22870885314895537, "flos": 21286160184960.0, "grad_norm": 1.7096143204037866, "language_loss": 0.85129672, "learning_rate": 3.505929840313845e-06, "loss": 0.87251282, "num_input_tokens_seen": 81876385, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.6171875, "step": 3804, "time_per_iteration": 2.4004709720611572 }, { "auxiliary_loss_clip": 0.0109045, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.01934481, "balance_loss_mlp": 1.02662039, "epoch": 0.22876897640162333, "flos": 14938566186240.0, "grad_norm": 1.8684811704061102, "language_loss": 0.76703346, "learning_rate": 3.5056811834346382e-06, "loss": 0.78830791, "num_input_tokens_seen": 81893225, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.640625, "step": 3805, "time_per_iteration": 2.343780040740967 }, { "auxiliary_loss_clip": 0.01089323, "auxiliary_loss_mlp": 0.0103756, "balance_loss_clip": 1.01934469, "balance_loss_mlp": 1.02592516, "epoch": 0.2288290996542913, "flos": 18112939223040.0, "grad_norm": 2.326606822596895, "language_loss": 0.78419352, "learning_rate": 3.5054324728215423e-06, "loss": 0.80546236, "num_input_tokens_seen": 81911350, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6328125, "step": 3806, "time_per_iteration": 3.7545509338378906 }, { "auxiliary_loss_clip": 0.01088861, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.02324224, "balance_loss_mlp": 1.02702498, "epoch": 0.22888922290695926, "flos": 39853983778560.0, "grad_norm": 3.2673323541602923, "language_loss": 0.70302856, "learning_rate": 3.505183708483434e-06, "loss": 0.72431827, "num_input_tokens_seen": 81935420, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.6171875, "step": 3807, "time_per_iteration": 2.5256102085113525 }, { "auxiliary_loss_clip": 0.01094187, "auxiliary_loss_mlp": 0.01042855, "balance_loss_clip": 1.02417517, "balance_loss_mlp": 1.02921653, "epoch": 0.22894934615962723, "flos": 23402664892800.0, "grad_norm": 2.0048278010572735, "language_loss": 0.65318346, "learning_rate": 3.504934890429191e-06, "loss": 0.67455387, "num_input_tokens_seen": 81953845, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6484375, "step": 3808, "time_per_iteration": 2.4050979614257812 }, { "auxiliary_loss_clip": 0.01089716, "auxiliary_loss_mlp": 0.01041493, "balance_loss_clip": 1.02494645, "balance_loss_mlp": 1.02788758, "epoch": 0.22900946941229522, "flos": 18842030478720.0, "grad_norm": 1.9090335901439184, "language_loss": 0.75185037, "learning_rate": 3.5046860186676936e-06, "loss": 0.77316242, "num_input_tokens_seen": 81972100, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.6171875, "step": 3809, "time_per_iteration": 3.7812273502349854 }, { "auxiliary_loss_clip": 0.01087151, "auxiliary_loss_mlp": 0.01036026, "balance_loss_clip": 1.01870501, "balance_loss_mlp": 1.02752638, "epoch": 0.22906959266496318, "flos": 22928299983360.0, "grad_norm": 1.434623301717465, "language_loss": 0.81609118, "learning_rate": 3.504437093207822e-06, "loss": 0.83732295, "num_input_tokens_seen": 81992760, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59375, "step": 3810, "time_per_iteration": 3.8533847332000732 }, { "auxiliary_loss_clip": 0.01087667, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.01723254, "balance_loss_mlp": 1.02792406, "epoch": 0.22912971591763115, "flos": 19353508030080.0, "grad_norm": 2.011777331327998, "language_loss": 0.7841962, "learning_rate": 3.5041881140584602e-06, "loss": 0.80540693, "num_input_tokens_seen": 82009080, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.59765625, "step": 3811, "time_per_iteration": 2.3599648475646973 }, { "auxiliary_loss_clip": 0.01087382, "auxiliary_loss_mlp": 0.01036212, "balance_loss_clip": 1.01943886, "balance_loss_mlp": 1.02636838, "epoch": 0.22918983917029911, "flos": 19932647529600.0, "grad_norm": 1.8143441437452754, "language_loss": 0.83240467, "learning_rate": 3.5039390812284937e-06, "loss": 0.85364068, "num_input_tokens_seen": 82026705, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.609375, "step": 3812, "time_per_iteration": 2.364509105682373 }, { "auxiliary_loss_clip": 0.01092594, "auxiliary_loss_mlp": 0.01038553, "balance_loss_clip": 1.0198257, "balance_loss_mlp": 1.02847826, "epoch": 0.22924996242296708, "flos": 16689949228800.0, "grad_norm": 2.681985172712628, "language_loss": 0.83799887, "learning_rate": 3.5036899947268105e-06, "loss": 0.85931039, "num_input_tokens_seen": 82043245, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.640625, "step": 3813, "time_per_iteration": 2.3506107330322266 }, { "auxiliary_loss_clip": 0.01087346, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.01410723, "balance_loss_mlp": 1.02632737, "epoch": 0.22931008567563504, "flos": 33034782867840.0, "grad_norm": 1.673959294344394, "language_loss": 0.70345366, "learning_rate": 3.5034408545623e-06, "loss": 0.72464311, "num_input_tokens_seen": 82066870, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.609375, "step": 3814, "time_per_iteration": 3.8504204750061035 }, { "auxiliary_loss_clip": 0.01084833, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.01902843, "balance_loss_mlp": 1.02492142, "epoch": 0.229370208928303, "flos": 23329591683840.0, "grad_norm": 2.337415803708085, "language_loss": 0.66801226, "learning_rate": 3.5031916607438516e-06, "loss": 0.68921727, "num_input_tokens_seen": 82083180, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59765625, "step": 3815, "time_per_iteration": 2.3961167335510254 }, { "auxiliary_loss_clip": 0.01089226, "auxiliary_loss_mlp": 0.01045217, "balance_loss_clip": 1.02741861, "balance_loss_mlp": 1.02737093, "epoch": 0.229430332180971, "flos": 28616070597120.0, "grad_norm": 1.7927608595653053, "language_loss": 0.83907104, "learning_rate": 3.50294241328036e-06, "loss": 0.86041558, "num_input_tokens_seen": 82102950, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6171875, "step": 3816, "time_per_iteration": 2.436598539352417 }, { "auxiliary_loss_clip": 0.01088234, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.01387787, "balance_loss_mlp": 1.02649224, "epoch": 0.22949045543363897, "flos": 17237247701760.0, "grad_norm": 2.5401060833844453, "language_loss": 0.8700307, "learning_rate": 3.5026931121807195e-06, "loss": 0.89123505, "num_input_tokens_seen": 82119510, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6171875, "step": 3817, "time_per_iteration": 2.340454578399658 }, { "auxiliary_loss_clip": 0.01088736, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.0173552, "balance_loss_mlp": 1.02608013, "epoch": 0.22955057868630693, "flos": 27488236170240.0, "grad_norm": 1.7309962909696994, "language_loss": 0.75099266, "learning_rate": 3.5024437574538275e-06, "loss": 0.77223289, "num_input_tokens_seen": 82140095, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 3818, "time_per_iteration": 2.4216606616973877 }, { "auxiliary_loss_clip": 0.01088797, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.013134, "balance_loss_mlp": 1.02624011, "epoch": 0.2296107019389749, "flos": 23475319165440.0, "grad_norm": 1.5652278066751535, "language_loss": 0.7429148, "learning_rate": 3.5021943491085823e-06, "loss": 0.76410252, "num_input_tokens_seen": 82159510, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.625, "step": 3819, "time_per_iteration": 2.4085023403167725 }, { "auxiliary_loss_clip": 0.01088546, "auxiliary_loss_mlp": 0.01034126, "balance_loss_clip": 1.01664972, "balance_loss_mlp": 1.02787375, "epoch": 0.22967082519164286, "flos": 31283818761600.0, "grad_norm": 1.8277362002645299, "language_loss": 0.81004488, "learning_rate": 3.5019448871538853e-06, "loss": 0.83127153, "num_input_tokens_seen": 82179580, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.609375, "step": 3820, "time_per_iteration": 2.443455219268799 }, { "auxiliary_loss_clip": 0.01090382, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.01803446, "balance_loss_mlp": 1.02673411, "epoch": 0.22973094844431083, "flos": 14642188721280.0, "grad_norm": 1.8653661172565414, "language_loss": 0.69101381, "learning_rate": 3.501695371598638e-06, "loss": 0.71228445, "num_input_tokens_seen": 82195585, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.63671875, "step": 3821, "time_per_iteration": 2.3479127883911133 }, { "auxiliary_loss_clip": 0.01087111, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.01444376, "balance_loss_mlp": 1.02677619, "epoch": 0.2297910716969788, "flos": 22822652609280.0, "grad_norm": 1.6122764700202343, "language_loss": 0.82934833, "learning_rate": 3.501445802451746e-06, "loss": 0.85054123, "num_input_tokens_seen": 82217530, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 3822, "time_per_iteration": 2.4323556423187256 }, { "auxiliary_loss_clip": 0.01085935, "auxiliary_loss_mlp": 0.01031304, "balance_loss_clip": 1.01349413, "balance_loss_mlp": 1.02483392, "epoch": 0.2298511949496468, "flos": 23037927252480.0, "grad_norm": 1.6174899692367815, "language_loss": 0.6632266, "learning_rate": 3.5011961797221158e-06, "loss": 0.68439901, "num_input_tokens_seen": 82237980, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.61328125, "step": 3823, "time_per_iteration": 2.385617971420288 }, { "auxiliary_loss_clip": 0.01018244, "auxiliary_loss_mlp": 0.0100774, "balance_loss_clip": 1.00547528, "balance_loss_mlp": 1.00400758, "epoch": 0.22991131820231475, "flos": 66887684691840.0, "grad_norm": 0.8083559781092068, "language_loss": 0.56830817, "learning_rate": 3.5009465034186554e-06, "loss": 0.58856803, "num_input_tokens_seen": 82301785, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.14257812, "step": 3824, "time_per_iteration": 3.1239378452301025 }, { "auxiliary_loss_clip": 0.01084447, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.01605201, "balance_loss_mlp": 1.02550197, "epoch": 0.22997144145498272, "flos": 17886492944640.0, "grad_norm": 3.065801308947649, "language_loss": 0.73238909, "learning_rate": 3.500696773550275e-06, "loss": 0.75357115, "num_input_tokens_seen": 82317355, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.58984375, "step": 3825, "time_per_iteration": 2.321202516555786 }, { "auxiliary_loss_clip": 0.01091978, "auxiliary_loss_mlp": 0.01038205, "balance_loss_clip": 1.0186305, "balance_loss_mlp": 1.0295465, "epoch": 0.23003156470765068, "flos": 24675807864960.0, "grad_norm": 1.7841718054769142, "language_loss": 0.8789047, "learning_rate": 3.5004469901258873e-06, "loss": 0.90020657, "num_input_tokens_seen": 82336645, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.625, "step": 3826, "time_per_iteration": 2.4327785968780518 }, { "auxiliary_loss_clip": 0.01089859, "auxiliary_loss_mlp": 0.01041345, "balance_loss_clip": 1.02131748, "balance_loss_mlp": 1.02550173, "epoch": 0.23009168796031865, "flos": 15813245278080.0, "grad_norm": 2.330247921206323, "language_loss": 0.81820428, "learning_rate": 3.5001971531544053e-06, "loss": 0.83951628, "num_input_tokens_seen": 82354225, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.64453125, "step": 3827, "time_per_iteration": 2.365079164505005 }, { "auxiliary_loss_clip": 0.01085776, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.01772785, "balance_loss_mlp": 1.02626896, "epoch": 0.2301518112129866, "flos": 16212023360640.0, "grad_norm": 1.9029908701929552, "language_loss": 0.86464047, "learning_rate": 3.499947262644747e-06, "loss": 0.88585246, "num_input_tokens_seen": 82370240, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.59375, "step": 3828, "time_per_iteration": 2.3360729217529297 }, { "auxiliary_loss_clip": 0.01088078, "auxiliary_loss_mlp": 0.01039298, "balance_loss_clip": 1.02073741, "balance_loss_mlp": 1.02658987, "epoch": 0.2302119344656546, "flos": 20594391039360.0, "grad_norm": 1.8951397423855496, "language_loss": 0.70642465, "learning_rate": 3.4996973186058284e-06, "loss": 0.72769845, "num_input_tokens_seen": 82389145, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6171875, "step": 3829, "time_per_iteration": 2.386470079421997 }, { "auxiliary_loss_clip": 0.0108747, "auxiliary_loss_mlp": 0.01032478, "balance_loss_clip": 1.01601517, "balance_loss_mlp": 1.02867401, "epoch": 0.23027205771832257, "flos": 26795698974720.0, "grad_norm": 1.4732655685815619, "language_loss": 0.84161735, "learning_rate": 3.4994473210465706e-06, "loss": 0.86281681, "num_input_tokens_seen": 82409185, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5859375, "step": 3830, "time_per_iteration": 2.4395828247070312 }, { "auxiliary_loss_clip": 0.01087966, "auxiliary_loss_mlp": 0.010417, "balance_loss_clip": 1.02379513, "balance_loss_mlp": 1.0273571, "epoch": 0.23033218097099054, "flos": 43871439260160.0, "grad_norm": 1.6509844910798381, "language_loss": 0.67261213, "learning_rate": 3.499197269975895e-06, "loss": 0.69390881, "num_input_tokens_seen": 82432070, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.60546875, "step": 3831, "time_per_iteration": 2.5767300128936768 }, { "auxiliary_loss_clip": 0.01089142, "auxiliary_loss_mlp": 0.01037978, "balance_loss_clip": 1.01898837, "balance_loss_mlp": 1.02638268, "epoch": 0.2303923042236585, "flos": 26066468073600.0, "grad_norm": 2.0001274322563645, "language_loss": 0.74681842, "learning_rate": 3.4989471654027247e-06, "loss": 0.76808959, "num_input_tokens_seen": 82450625, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.62890625, "step": 3832, "time_per_iteration": 2.4149653911590576 }, { "auxiliary_loss_clip": 0.01087654, "auxiliary_loss_mlp": 0.01038064, "balance_loss_clip": 1.01906228, "balance_loss_mlp": 1.02671075, "epoch": 0.23045242747632647, "flos": 18295395321600.0, "grad_norm": 1.6620348391588846, "language_loss": 0.87407881, "learning_rate": 3.4986970073359865e-06, "loss": 0.89533603, "num_input_tokens_seen": 82468575, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.609375, "step": 3833, "time_per_iteration": 2.358140468597412 }, { "auxiliary_loss_clip": 0.01087184, "auxiliary_loss_mlp": 0.01039239, "balance_loss_clip": 1.02097583, "balance_loss_mlp": 1.02623963, "epoch": 0.23051255072899443, "flos": 25519344157440.0, "grad_norm": 1.7262268838511203, "language_loss": 0.74829298, "learning_rate": 3.498446795784607e-06, "loss": 0.76955724, "num_input_tokens_seen": 82488655, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.609375, "step": 3834, "time_per_iteration": 2.412421941757202 }, { "auxiliary_loss_clip": 0.01087917, "auxiliary_loss_mlp": 0.0104055, "balance_loss_clip": 1.02178693, "balance_loss_mlp": 1.02836227, "epoch": 0.2305726739816624, "flos": 21214134316800.0, "grad_norm": 1.6548479875892168, "language_loss": 0.85558653, "learning_rate": 3.4981965307575153e-06, "loss": 0.87687123, "num_input_tokens_seen": 82507220, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.59375, "step": 3835, "time_per_iteration": 2.3953325748443604 }, { "auxiliary_loss_clip": 0.01094482, "auxiliary_loss_mlp": 0.01041777, "balance_loss_clip": 1.02287018, "balance_loss_mlp": 1.02779126, "epoch": 0.2306327972343304, "flos": 23330010620160.0, "grad_norm": 2.0091674081892386, "language_loss": 0.81818467, "learning_rate": 3.4979462122636436e-06, "loss": 0.83954728, "num_input_tokens_seen": 82527920, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.66796875, "step": 3836, "time_per_iteration": 2.389854907989502 }, { "auxiliary_loss_clip": 0.0109041, "auxiliary_loss_mlp": 0.01038697, "balance_loss_clip": 1.01990986, "balance_loss_mlp": 1.02915406, "epoch": 0.23069292048699835, "flos": 20665718680320.0, "grad_norm": 3.6008190810214966, "language_loss": 0.79711235, "learning_rate": 3.497695840311925e-06, "loss": 0.81840348, "num_input_tokens_seen": 82549040, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.61328125, "step": 3837, "time_per_iteration": 2.3949389457702637 }, { "auxiliary_loss_clip": 0.01018597, "auxiliary_loss_mlp": 0.01004495, "balance_loss_clip": 1.00242114, "balance_loss_mlp": 1.00478077, "epoch": 0.23075304373966632, "flos": 70451828680320.0, "grad_norm": 0.9064880113181014, "language_loss": 0.65390468, "learning_rate": 3.4974454149112943e-06, "loss": 0.67413557, "num_input_tokens_seen": 82604070, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.13867188, "step": 3838, "time_per_iteration": 2.934767246246338 }, { "auxiliary_loss_clip": 0.01085654, "auxiliary_loss_mlp": 0.01036766, "balance_loss_clip": 1.01905131, "balance_loss_mlp": 1.02633429, "epoch": 0.23081316699233428, "flos": 16617050576640.0, "grad_norm": 1.8372022434081068, "language_loss": 0.75830615, "learning_rate": 3.4971949360706887e-06, "loss": 0.77953029, "num_input_tokens_seen": 82619665, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.59375, "step": 3839, "time_per_iteration": 2.355003833770752 }, { "auxiliary_loss_clip": 0.01093763, "auxiliary_loss_mlp": 0.01037886, "balance_loss_clip": 1.01964688, "balance_loss_mlp": 1.03051066, "epoch": 0.23087329024500225, "flos": 13297229349120.0, "grad_norm": 1.7005344560926294, "language_loss": 0.68687391, "learning_rate": 3.4969444037990466e-06, "loss": 0.70819044, "num_input_tokens_seen": 82637530, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6328125, "step": 3840, "time_per_iteration": 2.3609533309936523 }, { "auxiliary_loss_clip": 0.01090251, "auxiliary_loss_mlp": 0.01030775, "balance_loss_clip": 1.01127279, "balance_loss_mlp": 1.02743244, "epoch": 0.23093341349767021, "flos": 17784755642880.0, "grad_norm": 1.9811088588370027, "language_loss": 0.79281104, "learning_rate": 3.49669381810531e-06, "loss": 0.81402135, "num_input_tokens_seen": 82656130, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.62890625, "step": 3841, "time_per_iteration": 2.3632378578186035 }, { "auxiliary_loss_clip": 0.01088904, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.01825762, "balance_loss_mlp": 1.02759445, "epoch": 0.23099353675033818, "flos": 23986936362240.0, "grad_norm": 1.742101409675352, "language_loss": 0.82951319, "learning_rate": 3.4964431789984204e-06, "loss": 0.85074925, "num_input_tokens_seen": 82675295, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.609375, "step": 3842, "time_per_iteration": 2.3822836875915527 }, { "auxiliary_loss_clip": 0.01086767, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.02311015, "balance_loss_mlp": 1.0251447, "epoch": 0.23105366000300617, "flos": 35993601970560.0, "grad_norm": 1.4223679128032825, "language_loss": 0.66324598, "learning_rate": 3.496192486487323e-06, "loss": 0.68453968, "num_input_tokens_seen": 82703260, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6171875, "step": 3843, "time_per_iteration": 2.578294038772583 }, { "auxiliary_loss_clip": 0.01087501, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.02137303, "balance_loss_mlp": 1.02754712, "epoch": 0.23111378325567414, "flos": 31244087767680.0, "grad_norm": 1.8211004731245037, "language_loss": 0.77522135, "learning_rate": 3.495941740580965e-06, "loss": 0.79648989, "num_input_tokens_seen": 82725060, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6015625, "step": 3844, "time_per_iteration": 2.45005202293396 }, { "auxiliary_loss_clip": 0.01089918, "auxiliary_loss_mlp": 0.01036045, "balance_loss_clip": 1.01710296, "balance_loss_mlp": 1.02741623, "epoch": 0.2311739065083421, "flos": 19207221966720.0, "grad_norm": 1.6325071452183224, "language_loss": 0.77953732, "learning_rate": 3.495690941288294e-06, "loss": 0.80079699, "num_input_tokens_seen": 82742960, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.625, "step": 3845, "time_per_iteration": 3.743535041809082 }, { "auxiliary_loss_clip": 0.01083719, "auxiliary_loss_mlp": 0.01030808, "balance_loss_clip": 1.01330805, "balance_loss_mlp": 1.02568829, "epoch": 0.23123402976101007, "flos": 23359268206080.0, "grad_norm": 2.5102203572769435, "language_loss": 0.76146102, "learning_rate": 3.495440088618261e-06, "loss": 0.78260636, "num_input_tokens_seen": 82760205, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.578125, "step": 3846, "time_per_iteration": 2.3775031566619873 }, { "auxiliary_loss_clip": 0.01086575, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.01675582, "balance_loss_mlp": 1.02657318, "epoch": 0.23129415301367803, "flos": 13734516528000.0, "grad_norm": 1.716294172686985, "language_loss": 0.69583297, "learning_rate": 3.4951891825798177e-06, "loss": 0.71705341, "num_input_tokens_seen": 82778590, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6015625, "step": 3847, "time_per_iteration": 2.3594911098480225 }, { "auxiliary_loss_clip": 0.01016762, "auxiliary_loss_mlp": 0.0100073, "balance_loss_clip": 0.99875158, "balance_loss_mlp": 1.00283837, "epoch": 0.231354276266346, "flos": 69733699591680.0, "grad_norm": 0.7874101911604106, "language_loss": 0.61031818, "learning_rate": 3.4949382231819186e-06, "loss": 0.6304931, "num_input_tokens_seen": 82833925, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.13867188, "step": 3848, "time_per_iteration": 2.929974317550659 }, { "auxiliary_loss_clip": 0.01085614, "auxiliary_loss_mlp": 0.01037029, "balance_loss_clip": 1.01906395, "balance_loss_mlp": 1.02494383, "epoch": 0.231414399519014, "flos": 18835118029440.0, "grad_norm": 2.3678579967272486, "language_loss": 0.78285599, "learning_rate": 3.4946872104335192e-06, "loss": 0.80408239, "num_input_tokens_seen": 82850625, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.609375, "step": 3849, "time_per_iteration": 3.8106307983398438 }, { "auxiliary_loss_clip": 0.01087009, "auxiliary_loss_mlp": 0.01037892, "balance_loss_clip": 1.01940298, "balance_loss_mlp": 1.02645183, "epoch": 0.23147452277168196, "flos": 36134057836800.0, "grad_norm": 1.8836628050263342, "language_loss": 0.71054161, "learning_rate": 3.4944361443435788e-06, "loss": 0.73179066, "num_input_tokens_seen": 82872105, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.60546875, "step": 3850, "time_per_iteration": 3.8503291606903076 }, { "auxiliary_loss_clip": 0.01085889, "auxiliary_loss_mlp": 0.01032545, "balance_loss_clip": 1.01435411, "balance_loss_mlp": 1.02487469, "epoch": 0.23153464602434992, "flos": 20811900009600.0, "grad_norm": 1.6771053465701393, "language_loss": 0.76121211, "learning_rate": 3.4941850249210562e-06, "loss": 0.7823965, "num_input_tokens_seen": 82890595, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.609375, "step": 3851, "time_per_iteration": 2.372323513031006 }, { "auxiliary_loss_clip": 0.01085647, "auxiliary_loss_mlp": 0.01032212, "balance_loss_clip": 1.01492655, "balance_loss_mlp": 1.02648795, "epoch": 0.2315947692770179, "flos": 19938198435840.0, "grad_norm": 1.6970532262689262, "language_loss": 0.69931245, "learning_rate": 3.4939338521749137e-06, "loss": 0.72049105, "num_input_tokens_seen": 82908910, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.59375, "step": 3852, "time_per_iteration": 2.37731671333313 }, { "auxiliary_loss_clip": 0.01088036, "auxiliary_loss_mlp": 0.01034083, "balance_loss_clip": 1.01686323, "balance_loss_mlp": 1.02657413, "epoch": 0.23165489252968585, "flos": 12854845111680.0, "grad_norm": 2.373656272034946, "language_loss": 0.67155361, "learning_rate": 3.493682626114115e-06, "loss": 0.69277483, "num_input_tokens_seen": 82925405, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6171875, "step": 3853, "time_per_iteration": 2.344114303588867 }, { "auxiliary_loss_clip": 0.01087381, "auxiliary_loss_mlp": 0.01034954, "balance_loss_clip": 1.01583314, "balance_loss_mlp": 1.02618957, "epoch": 0.23171501578235382, "flos": 30626962842240.0, "grad_norm": 1.6026857997278943, "language_loss": 0.79938722, "learning_rate": 3.4934313467476255e-06, "loss": 0.82061064, "num_input_tokens_seen": 82945615, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.61328125, "step": 3854, "time_per_iteration": 3.821682929992676 }, { "auxiliary_loss_clip": 0.01090967, "auxiliary_loss_mlp": 0.01037838, "balance_loss_clip": 1.01839483, "balance_loss_mlp": 1.02631581, "epoch": 0.23177513903502178, "flos": 23841627816960.0, "grad_norm": 2.244247077098742, "language_loss": 0.65230674, "learning_rate": 3.4931800140844123e-06, "loss": 0.67359477, "num_input_tokens_seen": 82967570, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6484375, "step": 3855, "time_per_iteration": 2.4144818782806396 }, { "auxiliary_loss_clip": 0.01087764, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.02191257, "balance_loss_mlp": 1.02582347, "epoch": 0.23183526228768978, "flos": 29568989779200.0, "grad_norm": 2.3325060406619604, "language_loss": 0.70744312, "learning_rate": 3.4929286281334455e-06, "loss": 0.72872829, "num_input_tokens_seen": 82987435, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6171875, "step": 3856, "time_per_iteration": 2.4286093711853027 }, { "auxiliary_loss_clip": 0.01086147, "auxiliary_loss_mlp": 0.01037107, "balance_loss_clip": 1.02171731, "balance_loss_mlp": 1.02714205, "epoch": 0.23189538554035774, "flos": 34457284103040.0, "grad_norm": 1.524265996803013, "language_loss": 0.7678349, "learning_rate": 3.4926771889036964e-06, "loss": 0.78906745, "num_input_tokens_seen": 83010505, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.58984375, "step": 3857, "time_per_iteration": 2.5084476470947266 }, { "auxiliary_loss_clip": 0.01091045, "auxiliary_loss_mlp": 0.01043371, "balance_loss_clip": 1.02281952, "balance_loss_mlp": 1.02686715, "epoch": 0.2319555087930257, "flos": 18002858106240.0, "grad_norm": 2.153866176548299, "language_loss": 0.91147965, "learning_rate": 3.4924256964041387e-06, "loss": 0.93282378, "num_input_tokens_seen": 83026705, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 0.640625, "step": 3858, "time_per_iteration": 2.3367738723754883 }, { "auxiliary_loss_clip": 0.01085803, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.0198946, "balance_loss_mlp": 1.02741051, "epoch": 0.23201563204569367, "flos": 23142876399360.0, "grad_norm": 2.0352397765961436, "language_loss": 0.76528925, "learning_rate": 3.492174150643746e-06, "loss": 0.78650975, "num_input_tokens_seen": 83046500, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5859375, "step": 3859, "time_per_iteration": 2.3958740234375 }, { "auxiliary_loss_clip": 0.01083474, "auxiliary_loss_mlp": 0.01028742, "balance_loss_clip": 1.01096749, "balance_loss_mlp": 1.02491474, "epoch": 0.23207575529836164, "flos": 20666940577920.0, "grad_norm": 1.711314371365438, "language_loss": 0.84139782, "learning_rate": 3.4919225516314967e-06, "loss": 0.86251998, "num_input_tokens_seen": 83065280, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.5859375, "step": 3860, "time_per_iteration": 2.3686769008636475 }, { "auxiliary_loss_clip": 0.0108558, "auxiliary_loss_mlp": 0.01036474, "balance_loss_clip": 1.01955855, "balance_loss_mlp": 1.02602863, "epoch": 0.2321358785510296, "flos": 16471253272320.0, "grad_norm": 2.2656897178590616, "language_loss": 0.83054185, "learning_rate": 3.491670899376369e-06, "loss": 0.85176235, "num_input_tokens_seen": 83082310, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59375, "step": 3861, "time_per_iteration": 2.350154399871826 }, { "auxiliary_loss_clip": 0.01085996, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.01777101, "balance_loss_mlp": 1.02532959, "epoch": 0.2321960018036976, "flos": 21615251460480.0, "grad_norm": 1.4941762555325888, "language_loss": 0.85749722, "learning_rate": 3.491419193887344e-06, "loss": 0.87871373, "num_input_tokens_seen": 83102065, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.609375, "step": 3862, "time_per_iteration": 2.37319278717041 }, { "auxiliary_loss_clip": 0.01084899, "auxiliary_loss_mlp": 0.01035926, "balance_loss_clip": 1.01992798, "balance_loss_mlp": 1.02606058, "epoch": 0.23225612505636556, "flos": 22270431634560.0, "grad_norm": 1.3822871416728404, "language_loss": 0.74682367, "learning_rate": 3.4911674351734036e-06, "loss": 0.76803184, "num_input_tokens_seen": 83121445, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.58984375, "step": 3863, "time_per_iteration": 2.3927905559539795 }, { "auxiliary_loss_clip": 0.01086858, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.01740432, "balance_loss_mlp": 1.02881217, "epoch": 0.23231624830903352, "flos": 17051475024000.0, "grad_norm": 1.7194030568120946, "language_loss": 0.7429074, "learning_rate": 3.490915623243534e-06, "loss": 0.76412261, "num_input_tokens_seen": 83138175, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.58203125, "step": 3864, "time_per_iteration": 2.334650754928589 }, { "auxiliary_loss_clip": 0.01085904, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.01183629, "balance_loss_mlp": 1.02545595, "epoch": 0.2323763715617015, "flos": 34638657949440.0, "grad_norm": 1.6110785405592347, "language_loss": 0.70623219, "learning_rate": 3.490663758106721e-06, "loss": 0.72738063, "num_input_tokens_seen": 83161975, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.60546875, "step": 3865, "time_per_iteration": 2.5042667388916016 }, { "auxiliary_loss_clip": 0.01093552, "auxiliary_loss_mlp": 0.01043105, "balance_loss_clip": 1.02183831, "balance_loss_mlp": 1.02737474, "epoch": 0.23243649481436945, "flos": 25550661513600.0, "grad_norm": 1.8160159251501384, "language_loss": 0.95263124, "learning_rate": 3.4904118397719527e-06, "loss": 0.97399777, "num_input_tokens_seen": 83180905, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 0.66015625, "step": 3866, "time_per_iteration": 2.4118688106536865 }, { "auxiliary_loss_clip": 0.01084664, "auxiliary_loss_mlp": 0.01035237, "balance_loss_clip": 1.01782107, "balance_loss_mlp": 1.02631032, "epoch": 0.23249661806703742, "flos": 20482494531840.0, "grad_norm": 2.6971448797499766, "language_loss": 0.7372874, "learning_rate": 3.4901598682482198e-06, "loss": 0.75848639, "num_input_tokens_seen": 83196390, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.58203125, "step": 3867, "time_per_iteration": 2.367206573486328 }, { "auxiliary_loss_clip": 0.0108622, "auxiliary_loss_mlp": 0.01038454, "balance_loss_clip": 1.02017939, "balance_loss_mlp": 1.02629495, "epoch": 0.23255674131970538, "flos": 20375555437440.0, "grad_norm": 1.6609717679460123, "language_loss": 0.82445127, "learning_rate": 3.489907843544514e-06, "loss": 0.845698, "num_input_tokens_seen": 83216165, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.59765625, "step": 3868, "time_per_iteration": 2.369361639022827 }, { "auxiliary_loss_clip": 0.01084422, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 1.01701486, "balance_loss_mlp": 1.02724826, "epoch": 0.23261686457237338, "flos": 17055140716800.0, "grad_norm": 59.581132554126874, "language_loss": 0.72886205, "learning_rate": 3.48965576566983e-06, "loss": 0.75003791, "num_input_tokens_seen": 83233845, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5703125, "step": 3869, "time_per_iteration": 2.351043701171875 }, { "auxiliary_loss_clip": 0.01086381, "auxiliary_loss_mlp": 0.01036553, "balance_loss_clip": 1.01924992, "balance_loss_mlp": 1.02770185, "epoch": 0.23267698782504134, "flos": 29168570862720.0, "grad_norm": 1.7408269110206762, "language_loss": 0.7938329, "learning_rate": 3.4894036346331633e-06, "loss": 0.81506222, "num_input_tokens_seen": 83254930, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5859375, "step": 3870, "time_per_iteration": 2.4447643756866455 }, { "auxiliary_loss_clip": 0.0109148, "auxiliary_loss_mlp": 0.01036928, "balance_loss_clip": 1.01787853, "balance_loss_mlp": 1.02846575, "epoch": 0.2327371110777093, "flos": 21173705095680.0, "grad_norm": 1.785973332768856, "language_loss": 0.70797658, "learning_rate": 3.4891514504435122e-06, "loss": 0.72926068, "num_input_tokens_seen": 83272095, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6328125, "step": 3871, "time_per_iteration": 2.384000062942505 }, { "auxiliary_loss_clip": 0.01091167, "auxiliary_loss_mlp": 0.01050932, "balance_loss_clip": 1.03105998, "balance_loss_mlp": 1.02725577, "epoch": 0.23279723433037727, "flos": 24861964567680.0, "grad_norm": 1.9030353513864107, "language_loss": 0.68676955, "learning_rate": 3.488899213109877e-06, "loss": 0.7081905, "num_input_tokens_seen": 83290980, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.63671875, "step": 3872, "time_per_iteration": 2.4129700660705566 }, { "auxiliary_loss_clip": 0.01092725, "auxiliary_loss_mlp": 0.01037721, "balance_loss_clip": 1.01759875, "balance_loss_mlp": 1.02817512, "epoch": 0.23285735758304524, "flos": 38799082915200.0, "grad_norm": 1.5032490787504662, "language_loss": 0.77853107, "learning_rate": 3.4886469226412574e-06, "loss": 0.79983556, "num_input_tokens_seen": 83315175, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.6484375, "step": 3873, "time_per_iteration": 2.5239038467407227 }, { "auxiliary_loss_clip": 0.0101591, "auxiliary_loss_mlp": 0.010095, "balance_loss_clip": 1.00760484, "balance_loss_mlp": 1.00248373, "epoch": 0.2329174808357132, "flos": 53941086927360.0, "grad_norm": 0.8483968145996765, "language_loss": 0.60481763, "learning_rate": 3.48839457904666e-06, "loss": 0.62507164, "num_input_tokens_seen": 83372060, "router_z_loss_clip": 0.0189209, "router_z_loss_mlp": 0.13476562, "step": 3874, "time_per_iteration": 2.94453501701355 }, { "auxiliary_loss_clip": 0.01088082, "auxiliary_loss_mlp": 0.01041089, "balance_loss_clip": 1.02206349, "balance_loss_mlp": 1.0273155, "epoch": 0.23297760408838117, "flos": 21214937278080.0, "grad_norm": 3.6098683362584985, "language_loss": 0.80544692, "learning_rate": 3.488142182335088e-06, "loss": 0.82673866, "num_input_tokens_seen": 83389795, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.609375, "step": 3875, "time_per_iteration": 2.3684475421905518 }, { "auxiliary_loss_clip": 0.01088315, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.01553428, "balance_loss_mlp": 1.02850652, "epoch": 0.23303772734104916, "flos": 28401738560640.0, "grad_norm": 1.8928251504185174, "language_loss": 0.61316186, "learning_rate": 3.4878897325155493e-06, "loss": 0.63436735, "num_input_tokens_seen": 83410005, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.59765625, "step": 3876, "time_per_iteration": 2.4443576335906982 }, { "auxiliary_loss_clip": 0.01091508, "auxiliary_loss_mlp": 0.01041779, "balance_loss_clip": 1.02286053, "balance_loss_mlp": 1.02828074, "epoch": 0.23309785059371713, "flos": 24313618753920.0, "grad_norm": 1.8395140767558795, "language_loss": 0.70228851, "learning_rate": 3.4876372295970533e-06, "loss": 0.72362137, "num_input_tokens_seen": 83430250, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6328125, "step": 3877, "time_per_iteration": 2.39897084236145 }, { "auxiliary_loss_clip": 0.01090161, "auxiliary_loss_mlp": 0.01050481, "balance_loss_clip": 1.03028655, "balance_loss_mlp": 1.02755427, "epoch": 0.2331579738463851, "flos": 15992140417920.0, "grad_norm": 2.1233250462097635, "language_loss": 0.80935645, "learning_rate": 3.4873846735886113e-06, "loss": 0.8307628, "num_input_tokens_seen": 83447950, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.625, "step": 3878, "time_per_iteration": 2.351627826690674 }, { "auxiliary_loss_clip": 0.01093163, "auxiliary_loss_mlp": 0.01041984, "balance_loss_clip": 1.02279115, "balance_loss_mlp": 1.02850604, "epoch": 0.23321809709905306, "flos": 36425547711360.0, "grad_norm": 1.592302194329684, "language_loss": 0.75221008, "learning_rate": 3.487132064499237e-06, "loss": 0.7735616, "num_input_tokens_seen": 83467785, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.64453125, "step": 3879, "time_per_iteration": 2.490724802017212 }, { "auxiliary_loss_clip": 0.01089623, "auxiliary_loss_mlp": 0.01038217, "balance_loss_clip": 1.01984656, "balance_loss_mlp": 1.02678251, "epoch": 0.23327822035172102, "flos": 21323691763200.0, "grad_norm": 1.9218040636305243, "language_loss": 0.8951329, "learning_rate": 3.4868794023379433e-06, "loss": 0.91641128, "num_input_tokens_seen": 83485390, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.62890625, "step": 3880, "time_per_iteration": 2.3741278648376465 }, { "auxiliary_loss_clip": 0.01091293, "auxiliary_loss_mlp": 0.01035241, "balance_loss_clip": 1.01701427, "balance_loss_mlp": 1.02880943, "epoch": 0.233338343604389, "flos": 19170877374720.0, "grad_norm": 1.6104074644906894, "language_loss": 0.71677834, "learning_rate": 3.4866266871137495e-06, "loss": 0.73804367, "num_input_tokens_seen": 83504890, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.625, "step": 3881, "time_per_iteration": 2.383903741836548 }, { "auxiliary_loss_clip": 0.01085002, "auxiliary_loss_mlp": 0.01037509, "balance_loss_clip": 1.01941347, "balance_loss_mlp": 1.02560568, "epoch": 0.23339846685705698, "flos": 26907106723200.0, "grad_norm": 1.5624443064870686, "language_loss": 0.68101043, "learning_rate": 3.486373918835673e-06, "loss": 0.70223552, "num_input_tokens_seen": 83526475, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 3882, "time_per_iteration": 2.4449117183685303 }, { "auxiliary_loss_clip": 0.01088653, "auxiliary_loss_mlp": 0.01037966, "balance_loss_clip": 1.01885712, "balance_loss_mlp": 1.02729297, "epoch": 0.23345859010972494, "flos": 32341791824640.0, "grad_norm": 1.8848536996230683, "language_loss": 0.76615065, "learning_rate": 3.486121097512735e-06, "loss": 0.78741682, "num_input_tokens_seen": 83546620, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6171875, "step": 3883, "time_per_iteration": 2.4524500370025635 }, { "auxiliary_loss_clip": 0.01015285, "auxiliary_loss_mlp": 0.01002841, "balance_loss_clip": 1.0010047, "balance_loss_mlp": 1.00191069, "epoch": 0.2335187133623929, "flos": 58480633013760.0, "grad_norm": 0.777824197148817, "language_loss": 0.59107447, "learning_rate": 3.4858682231539575e-06, "loss": 0.61125576, "num_input_tokens_seen": 83616160, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.13378906, "step": 3884, "time_per_iteration": 3.1615242958068848 }, { "auxiliary_loss_clip": 0.01087233, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.0174377, "balance_loss_mlp": 1.02715921, "epoch": 0.23357883661506088, "flos": 24501067176960.0, "grad_norm": 1.6848883906985879, "language_loss": 0.8042841, "learning_rate": 3.4856152957683654e-06, "loss": 0.82551551, "num_input_tokens_seen": 83636795, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6015625, "step": 3885, "time_per_iteration": 3.812267780303955 }, { "auxiliary_loss_clip": 0.01088251, "auxiliary_loss_mlp": 0.01038334, "balance_loss_clip": 1.01920128, "balance_loss_mlp": 1.02714074, "epoch": 0.23363895986772884, "flos": 18947642941440.0, "grad_norm": 2.032075325271931, "language_loss": 0.88071245, "learning_rate": 3.4853623153649843e-06, "loss": 0.90197825, "num_input_tokens_seen": 83654050, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.609375, "step": 3886, "time_per_iteration": 2.342637777328491 }, { "auxiliary_loss_clip": 0.01092208, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.01576889, "balance_loss_mlp": 1.02930665, "epoch": 0.2336990831203968, "flos": 31685459575680.0, "grad_norm": 1.705614046026682, "language_loss": 0.72942907, "learning_rate": 3.4851092819528434e-06, "loss": 0.75070429, "num_input_tokens_seen": 83673720, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.62890625, "step": 3887, "time_per_iteration": 2.468019485473633 }, { "auxiliary_loss_clip": 0.01090694, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.01852798, "balance_loss_mlp": 1.02882934, "epoch": 0.23375920637306477, "flos": 27708503137920.0, "grad_norm": 1.7049365643049947, "language_loss": 0.83646351, "learning_rate": 3.4848561955409723e-06, "loss": 0.85773647, "num_input_tokens_seen": 83693470, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6171875, "step": 3888, "time_per_iteration": 3.776660442352295 }, { "auxiliary_loss_clip": 0.01088951, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.02158463, "balance_loss_mlp": 1.02729416, "epoch": 0.23381932962573276, "flos": 17674674526080.0, "grad_norm": 2.582089726648754, "language_loss": 0.8758015, "learning_rate": 3.4846030561384036e-06, "loss": 0.89709705, "num_input_tokens_seen": 83711620, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.6171875, "step": 3889, "time_per_iteration": 2.367008924484253 }, { "auxiliary_loss_clip": 0.01090311, "auxiliary_loss_mlp": 0.01035352, "balance_loss_clip": 1.01644588, "balance_loss_mlp": 1.02713513, "epoch": 0.23387945287840073, "flos": 14390010904320.0, "grad_norm": 6.79406742982316, "language_loss": 0.76318294, "learning_rate": 3.48434986375417e-06, "loss": 0.78443956, "num_input_tokens_seen": 83727890, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6328125, "step": 3890, "time_per_iteration": 3.715498924255371 }, { "auxiliary_loss_clip": 0.01089352, "auxiliary_loss_mlp": 0.01032187, "balance_loss_clip": 1.01415098, "balance_loss_mlp": 1.02785158, "epoch": 0.2339395761310687, "flos": 46096244605440.0, "grad_norm": 1.6402951909836185, "language_loss": 0.73124486, "learning_rate": 3.4840966183973085e-06, "loss": 0.75246024, "num_input_tokens_seen": 83749370, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6171875, "step": 3891, "time_per_iteration": 2.5878403186798096 }, { "auxiliary_loss_clip": 0.01084362, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.0168438, "balance_loss_mlp": 1.0261457, "epoch": 0.23399969938373666, "flos": 22380966599040.0, "grad_norm": 1.624153552412621, "language_loss": 0.82987958, "learning_rate": 3.483843320076856e-06, "loss": 0.85106862, "num_input_tokens_seen": 83769560, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58203125, "step": 3892, "time_per_iteration": 2.4133431911468506 }, { "auxiliary_loss_clip": 0.01089271, "auxiliary_loss_mlp": 0.01039675, "balance_loss_clip": 1.02114964, "balance_loss_mlp": 1.02633345, "epoch": 0.23405982263640462, "flos": 43506841265280.0, "grad_norm": 1.5790185781033186, "language_loss": 0.64797843, "learning_rate": 3.4835899688018522e-06, "loss": 0.66926789, "num_input_tokens_seen": 83795635, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.62890625, "step": 3893, "time_per_iteration": 2.57216477394104 }, { "auxiliary_loss_clip": 0.01087905, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.01685905, "balance_loss_mlp": 1.02709126, "epoch": 0.2341199458890726, "flos": 22563597254400.0, "grad_norm": 1.9864293071099186, "language_loss": 0.79282415, "learning_rate": 3.4833365645813384e-06, "loss": 0.81407309, "num_input_tokens_seen": 83814090, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.609375, "step": 3894, "time_per_iteration": 3.8073062896728516 }, { "auxiliary_loss_clip": 0.010865, "auxiliary_loss_mlp": 0.01032128, "balance_loss_clip": 1.01431835, "balance_loss_mlp": 1.02638721, "epoch": 0.23418006914174055, "flos": 25632672030720.0, "grad_norm": 1.399182172015132, "language_loss": 0.81676078, "learning_rate": 3.483083107424359e-06, "loss": 0.83794707, "num_input_tokens_seen": 83836870, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 3895, "time_per_iteration": 2.4315969944000244 }, { "auxiliary_loss_clip": 0.0108965, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.02311325, "balance_loss_mlp": 1.02690482, "epoch": 0.23424019239440855, "flos": 13545287625600.0, "grad_norm": 2.4338500250449786, "language_loss": 0.80449915, "learning_rate": 3.4828295973399576e-06, "loss": 0.82581705, "num_input_tokens_seen": 83853275, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.62890625, "step": 3896, "time_per_iteration": 2.3407487869262695 }, { "auxiliary_loss_clip": 0.01089227, "auxiliary_loss_mlp": 0.01039079, "balance_loss_clip": 1.01896858, "balance_loss_mlp": 1.02640009, "epoch": 0.2343003156470765, "flos": 22418393443200.0, "grad_norm": 1.6071938501328131, "language_loss": 0.83172464, "learning_rate": 3.4825760343371826e-06, "loss": 0.85300767, "num_input_tokens_seen": 83872340, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.62890625, "step": 3897, "time_per_iteration": 2.3793880939483643 }, { "auxiliary_loss_clip": 0.01090787, "auxiliary_loss_mlp": 0.01040324, "balance_loss_clip": 1.02080941, "balance_loss_mlp": 1.0267725, "epoch": 0.23436043889974448, "flos": 14790010884480.0, "grad_norm": 1.5581287520768663, "language_loss": 0.79288226, "learning_rate": 3.482322418425083e-06, "loss": 0.81419337, "num_input_tokens_seen": 83888795, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.640625, "step": 3898, "time_per_iteration": 2.358301877975464 }, { "auxiliary_loss_clip": 0.01088283, "auxiliary_loss_mlp": 0.01036677, "balance_loss_clip": 1.01794958, "balance_loss_mlp": 1.02875996, "epoch": 0.23442056215241244, "flos": 22964609664000.0, "grad_norm": 2.0556150322534488, "language_loss": 0.73653591, "learning_rate": 3.4820687496127086e-06, "loss": 0.7577855, "num_input_tokens_seen": 83906820, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.59375, "step": 3899, "time_per_iteration": 2.374284029006958 }, { "auxiliary_loss_clip": 0.01089848, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.01480436, "balance_loss_mlp": 1.02681553, "epoch": 0.2344806854050804, "flos": 23070885442560.0, "grad_norm": 1.7523844490759546, "language_loss": 0.75317299, "learning_rate": 3.481815027909113e-06, "loss": 0.77442276, "num_input_tokens_seen": 83926370, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.62890625, "step": 3900, "time_per_iteration": 2.3924379348754883 }, { "auxiliary_loss_clip": 0.01086836, "auxiliary_loss_mlp": 0.01044223, "balance_loss_clip": 1.02541137, "balance_loss_mlp": 1.02613676, "epoch": 0.23454080865774837, "flos": 16326119283840.0, "grad_norm": 1.8746186612364086, "language_loss": 0.67176574, "learning_rate": 3.481561253323351e-06, "loss": 0.69307637, "num_input_tokens_seen": 83944600, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.60546875, "step": 3901, "time_per_iteration": 2.3454980850219727 }, { "auxiliary_loss_clip": 0.01016168, "auxiliary_loss_mlp": 0.01003361, "balance_loss_clip": 1.00156093, "balance_loss_mlp": 1.00265777, "epoch": 0.23460093191041637, "flos": 67757860218240.0, "grad_norm": 0.7569055479015939, "language_loss": 0.58216643, "learning_rate": 3.4813074258644786e-06, "loss": 0.60236168, "num_input_tokens_seen": 84005100, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.13476562, "step": 3902, "time_per_iteration": 2.9768431186676025 }, { "auxiliary_loss_clip": 0.01089177, "auxiliary_loss_mlp": 0.01041788, "balance_loss_clip": 1.02316713, "balance_loss_mlp": 1.02695894, "epoch": 0.23466105516308433, "flos": 20076769088640.0, "grad_norm": 1.847964685177598, "language_loss": 0.80235386, "learning_rate": 3.4810535455415547e-06, "loss": 0.82366347, "num_input_tokens_seen": 84023775, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.62109375, "step": 3903, "time_per_iteration": 2.3543636798858643 }, { "auxiliary_loss_clip": 0.01087197, "auxiliary_loss_mlp": 0.01035546, "balance_loss_clip": 1.01584053, "balance_loss_mlp": 1.02472389, "epoch": 0.2347211784157523, "flos": 24534549037440.0, "grad_norm": 1.847324200959818, "language_loss": 0.82054985, "learning_rate": 3.4807996123636394e-06, "loss": 0.84177727, "num_input_tokens_seen": 84042605, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.625, "step": 3904, "time_per_iteration": 2.3945391178131104 }, { "auxiliary_loss_clip": 0.01087658, "auxiliary_loss_mlp": 0.01038302, "balance_loss_clip": 1.01990783, "balance_loss_mlp": 1.02744102, "epoch": 0.23478130166842026, "flos": 23803921681920.0, "grad_norm": 1.8055937303680838, "language_loss": 0.71191037, "learning_rate": 3.4805456263397954e-06, "loss": 0.73316991, "num_input_tokens_seen": 84061520, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6015625, "step": 3905, "time_per_iteration": 2.3860695362091064 }, { "auxiliary_loss_clip": 0.01085371, "auxiliary_loss_mlp": 0.01036509, "balance_loss_clip": 1.0176506, "balance_loss_mlp": 1.02658606, "epoch": 0.23484142492108823, "flos": 24092583736320.0, "grad_norm": 1.705053924116823, "language_loss": 0.7110635, "learning_rate": 3.480291587479086e-06, "loss": 0.73228228, "num_input_tokens_seen": 84081800, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.58984375, "step": 3906, "time_per_iteration": 2.4205853939056396 }, { "auxiliary_loss_clip": 0.01089727, "auxiliary_loss_mlp": 0.01035227, "balance_loss_clip": 1.01469874, "balance_loss_mlp": 1.02454805, "epoch": 0.2349015481737562, "flos": 29094555047040.0, "grad_norm": 1.8734093881306393, "language_loss": 0.73802781, "learning_rate": 3.4800374957905777e-06, "loss": 0.75927734, "num_input_tokens_seen": 84102340, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.65234375, "step": 3907, "time_per_iteration": 2.4347403049468994 }, { "auxiliary_loss_clip": 0.01089199, "auxiliary_loss_mlp": 0.01046658, "balance_loss_clip": 1.02795398, "balance_loss_mlp": 1.02631307, "epoch": 0.23496167142642416, "flos": 18915313155840.0, "grad_norm": 1.6152982109399794, "language_loss": 0.7262612, "learning_rate": 3.4797833512833376e-06, "loss": 0.74761975, "num_input_tokens_seen": 84120370, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.62890625, "step": 3908, "time_per_iteration": 2.3700857162475586 }, { "auxiliary_loss_clip": 0.01014905, "auxiliary_loss_mlp": 0.01001075, "balance_loss_clip": 0.99928665, "balance_loss_mlp": 1.00184536, "epoch": 0.23502179467909215, "flos": 55865255621760.0, "grad_norm": 1.0311515853158357, "language_loss": 0.73314607, "learning_rate": 3.479529153966437e-06, "loss": 0.75330579, "num_input_tokens_seen": 84165515, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.13085938, "step": 3909, "time_per_iteration": 2.7518317699432373 }, { "auxiliary_loss_clip": 0.01085447, "auxiliary_loss_mlp": 0.01042184, "balance_loss_clip": 1.02412355, "balance_loss_mlp": 1.02521765, "epoch": 0.23508191793176011, "flos": 23400709856640.0, "grad_norm": 1.610049055586779, "language_loss": 0.8800478, "learning_rate": 3.479274903848947e-06, "loss": 0.90132415, "num_input_tokens_seen": 84184540, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6015625, "step": 3910, "time_per_iteration": 2.4032609462738037 }, { "auxiliary_loss_clip": 0.01089212, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.0178293, "balance_loss_mlp": 1.02690864, "epoch": 0.23514204118442808, "flos": 20046638718720.0, "grad_norm": 2.4689970801754817, "language_loss": 0.76217383, "learning_rate": 3.4790206009399396e-06, "loss": 0.78342724, "num_input_tokens_seen": 84202025, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.625, "step": 3911, "time_per_iteration": 2.3631575107574463 }, { "auxiliary_loss_clip": 0.01086303, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.01840854, "balance_loss_mlp": 1.02771223, "epoch": 0.23520216443709605, "flos": 21579500361600.0, "grad_norm": 1.5249697522270447, "language_loss": 0.82007813, "learning_rate": 3.4787662452484923e-06, "loss": 0.84129679, "num_input_tokens_seen": 84221895, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 3912, "time_per_iteration": 2.392092227935791 }, { "auxiliary_loss_clip": 0.01086741, "auxiliary_loss_mlp": 0.01046849, "balance_loss_clip": 1.02833557, "balance_loss_mlp": 1.02640569, "epoch": 0.235262287689764, "flos": 23184667163520.0, "grad_norm": 1.9560258940231139, "language_loss": 0.71403623, "learning_rate": 3.4785118367836816e-06, "loss": 0.73537213, "num_input_tokens_seen": 84240455, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6015625, "step": 3913, "time_per_iteration": 2.378814697265625 }, { "auxiliary_loss_clip": 0.01092186, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.01817274, "balance_loss_mlp": 1.02705812, "epoch": 0.23532241094243198, "flos": 23184108581760.0, "grad_norm": 1.590253749853247, "language_loss": 0.76320422, "learning_rate": 3.4782573755545866e-06, "loss": 0.78451097, "num_input_tokens_seen": 84261605, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.65234375, "step": 3914, "time_per_iteration": 2.3976385593414307 }, { "auxiliary_loss_clip": 0.01088772, "auxiliary_loss_mlp": 0.01036684, "balance_loss_clip": 1.01784861, "balance_loss_mlp": 1.02729559, "epoch": 0.23538253419509997, "flos": 17018377188480.0, "grad_norm": 2.153764461088028, "language_loss": 0.89869112, "learning_rate": 3.478002861570288e-06, "loss": 0.91994566, "num_input_tokens_seen": 84278675, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.61328125, "step": 3915, "time_per_iteration": 2.331655740737915 }, { "auxiliary_loss_clip": 0.01014715, "auxiliary_loss_mlp": 0.01002028, "balance_loss_clip": 1.00016809, "balance_loss_mlp": 1.00118518, "epoch": 0.23544265744776793, "flos": 63445807751040.0, "grad_norm": 0.8060170169430493, "language_loss": 0.59416699, "learning_rate": 3.47774829483987e-06, "loss": 0.61433446, "num_input_tokens_seen": 84329765, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.13574219, "step": 3916, "time_per_iteration": 2.953676700592041 }, { "auxiliary_loss_clip": 0.01014091, "auxiliary_loss_mlp": 0.0100179, "balance_loss_clip": 0.999942, "balance_loss_mlp": 1.00077128, "epoch": 0.2355027807004359, "flos": 70511668617600.0, "grad_norm": 0.8909998172565021, "language_loss": 0.49440813, "learning_rate": 3.4774936753724156e-06, "loss": 0.51456696, "num_input_tokens_seen": 84393680, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.1328125, "step": 3917, "time_per_iteration": 3.037407398223877 }, { "auxiliary_loss_clip": 0.01092182, "auxiliary_loss_mlp": 0.01038689, "balance_loss_clip": 1.02008009, "balance_loss_mlp": 1.02693367, "epoch": 0.23556290395310386, "flos": 21433214298240.0, "grad_norm": 2.0099807227836446, "language_loss": 0.76812083, "learning_rate": 3.4772390031770126e-06, "loss": 0.78942955, "num_input_tokens_seen": 84412640, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.65234375, "step": 3918, "time_per_iteration": 2.3877439498901367 }, { "auxiliary_loss_clip": 0.01093324, "auxiliary_loss_mlp": 0.01036577, "balance_loss_clip": 1.01768243, "balance_loss_mlp": 1.02766883, "epoch": 0.23562302720577183, "flos": 18185453850240.0, "grad_norm": 1.8471874145199307, "language_loss": 0.69392735, "learning_rate": 3.47698427826275e-06, "loss": 0.71522635, "num_input_tokens_seen": 84431605, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.65625, "step": 3919, "time_per_iteration": 2.3740577697753906 }, { "auxiliary_loss_clip": 0.01086102, "auxiliary_loss_mlp": 0.01033642, "balance_loss_clip": 1.01590323, "balance_loss_mlp": 1.02619362, "epoch": 0.2356831504584398, "flos": 33729065631360.0, "grad_norm": 1.6244493627930483, "language_loss": 0.70738161, "learning_rate": 3.4767295006387174e-06, "loss": 0.72857904, "num_input_tokens_seen": 84454210, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 3920, "time_per_iteration": 2.4963719844818115 }, { "auxiliary_loss_clip": 0.01088191, "auxiliary_loss_mlp": 0.01041816, "balance_loss_clip": 1.02422094, "balance_loss_mlp": 1.02713609, "epoch": 0.23574327371110776, "flos": 24931721197440.0, "grad_norm": 1.505679621024444, "language_loss": 0.7673465, "learning_rate": 3.4764746703140077e-06, "loss": 0.78864658, "num_input_tokens_seen": 84475540, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.609375, "step": 3921, "time_per_iteration": 2.418783664703369 }, { "auxiliary_loss_clip": 0.01088373, "auxiliary_loss_mlp": 0.01041809, "balance_loss_clip": 1.02302134, "balance_loss_mlp": 1.02737713, "epoch": 0.23580339696377575, "flos": 17821135146240.0, "grad_norm": 2.095434254877917, "language_loss": 0.75107485, "learning_rate": 3.476219787297715e-06, "loss": 0.77237666, "num_input_tokens_seen": 84494580, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.609375, "step": 3922, "time_per_iteration": 2.382678508758545 }, { "auxiliary_loss_clip": 0.01085767, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.0165149, "balance_loss_mlp": 1.02481282, "epoch": 0.23586352021644372, "flos": 26285408409600.0, "grad_norm": 1.9576847834165292, "language_loss": 0.80365449, "learning_rate": 3.4759648515989356e-06, "loss": 0.82485676, "num_input_tokens_seen": 84513850, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.609375, "step": 3923, "time_per_iteration": 2.4159023761749268 }, { "auxiliary_loss_clip": 0.01084837, "auxiliary_loss_mlp": 0.01042515, "balance_loss_clip": 1.0240252, "balance_loss_mlp": 1.02530074, "epoch": 0.23592364346911168, "flos": 14245819522560.0, "grad_norm": 2.599898710072028, "language_loss": 0.74348283, "learning_rate": 3.4757098632267663e-06, "loss": 0.76475632, "num_input_tokens_seen": 84532315, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.59375, "step": 3924, "time_per_iteration": 3.7444300651550293 }, { "auxiliary_loss_clip": 0.01089198, "auxiliary_loss_mlp": 0.01035378, "balance_loss_clip": 1.0174135, "balance_loss_mlp": 1.02811205, "epoch": 0.23598376672177965, "flos": 18586955018880.0, "grad_norm": 1.565306928828022, "language_loss": 0.82739925, "learning_rate": 3.4754548221903086e-06, "loss": 0.84864497, "num_input_tokens_seen": 84550970, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.609375, "step": 3925, "time_per_iteration": 2.3710172176361084 }, { "auxiliary_loss_clip": 0.01091045, "auxiliary_loss_mlp": 0.01042492, "balance_loss_clip": 1.02396703, "balance_loss_mlp": 1.02750456, "epoch": 0.2360438899744476, "flos": 22674411509760.0, "grad_norm": 1.61479247563879, "language_loss": 0.59434247, "learning_rate": 3.475199728498664e-06, "loss": 0.61567783, "num_input_tokens_seen": 84571655, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6328125, "step": 3926, "time_per_iteration": 2.402881383895874 }, { "auxiliary_loss_clip": 0.01082674, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.0156306, "balance_loss_mlp": 1.02566981, "epoch": 0.23610401322711558, "flos": 29568850133760.0, "grad_norm": 1.9188225539479868, "language_loss": 0.71234751, "learning_rate": 3.474944582160935e-06, "loss": 0.7334981, "num_input_tokens_seen": 84593130, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 3927, "time_per_iteration": 2.434513568878174 }, { "auxiliary_loss_clip": 0.01087163, "auxiliary_loss_mlp": 0.01030594, "balance_loss_clip": 1.01372635, "balance_loss_mlp": 1.02647984, "epoch": 0.23616413647978354, "flos": 17857549560960.0, "grad_norm": 1.6424193366904158, "language_loss": 0.75205117, "learning_rate": 3.4746893831862287e-06, "loss": 0.77322876, "num_input_tokens_seen": 84612410, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.609375, "step": 3928, "time_per_iteration": 3.757261037826538 }, { "auxiliary_loss_clip": 0.01087093, "auxiliary_loss_mlp": 0.01041277, "balance_loss_clip": 1.02341962, "balance_loss_mlp": 1.02587724, "epoch": 0.23622425973245154, "flos": 11034089464320.0, "grad_norm": 5.534124166541476, "language_loss": 0.81882471, "learning_rate": 3.474434131583651e-06, "loss": 0.84010839, "num_input_tokens_seen": 84627610, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.609375, "step": 3929, "time_per_iteration": 3.7931859493255615 }, { "auxiliary_loss_clip": 0.01092217, "auxiliary_loss_mlp": 0.01038184, "balance_loss_clip": 1.01809752, "balance_loss_mlp": 1.02778602, "epoch": 0.2362843829851195, "flos": 23402944183680.0, "grad_norm": 1.7850875375794433, "language_loss": 0.7202158, "learning_rate": 3.474178827362312e-06, "loss": 0.74151981, "num_input_tokens_seen": 84648415, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.64453125, "step": 3930, "time_per_iteration": 2.425452470779419 }, { "auxiliary_loss_clip": 0.01086405, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.01482701, "balance_loss_mlp": 1.02544904, "epoch": 0.23634450623778747, "flos": 39528313816320.0, "grad_norm": 1.7034464418132031, "language_loss": 0.73838532, "learning_rate": 3.473923470531323e-06, "loss": 0.75958979, "num_input_tokens_seen": 84670080, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.609375, "step": 3931, "time_per_iteration": 2.5556492805480957 }, { "auxiliary_loss_clip": 0.01089561, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.01506436, "balance_loss_mlp": 1.02752662, "epoch": 0.23640462949045543, "flos": 24206016343680.0, "grad_norm": 1.906088396646724, "language_loss": 0.80198288, "learning_rate": 3.4736680610997965e-06, "loss": 0.82322443, "num_input_tokens_seen": 84686465, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.62109375, "step": 3932, "time_per_iteration": 2.400491952896118 }, { "auxiliary_loss_clip": 0.01087914, "auxiliary_loss_mlp": 0.01036607, "balance_loss_clip": 1.01971507, "balance_loss_mlp": 1.0282166, "epoch": 0.2364647527431234, "flos": 26176409544960.0, "grad_norm": 1.8649058480496419, "language_loss": 0.85352182, "learning_rate": 3.4734125990768476e-06, "loss": 0.87476707, "num_input_tokens_seen": 84708825, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.59765625, "step": 3933, "time_per_iteration": 3.808643341064453 }, { "auxiliary_loss_clip": 0.01094796, "auxiliary_loss_mlp": 0.01035201, "balance_loss_clip": 1.01617527, "balance_loss_mlp": 1.03128254, "epoch": 0.23652487599579136, "flos": 22635937324800.0, "grad_norm": 2.366228012432977, "language_loss": 0.82972109, "learning_rate": 3.473157084471593e-06, "loss": 0.85102105, "num_input_tokens_seen": 84726165, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.63671875, "step": 3934, "time_per_iteration": 2.3947384357452393 }, { "auxiliary_loss_clip": 0.01089428, "auxiliary_loss_mlp": 0.01037945, "balance_loss_clip": 1.01931226, "balance_loss_mlp": 1.0274868, "epoch": 0.23658499924845935, "flos": 21761188410240.0, "grad_norm": 1.9397582946968468, "language_loss": 0.78524363, "learning_rate": 3.472901517293152e-06, "loss": 0.80651736, "num_input_tokens_seen": 84745815, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.6171875, "step": 3935, "time_per_iteration": 2.3859567642211914 }, { "auxiliary_loss_clip": 0.01088847, "auxiliary_loss_mlp": 0.01036016, "balance_loss_clip": 1.01830173, "balance_loss_mlp": 1.02928877, "epoch": 0.23664512250112732, "flos": 21797917027200.0, "grad_norm": 2.019692854853557, "language_loss": 0.79821914, "learning_rate": 3.472645897550644e-06, "loss": 0.81946778, "num_input_tokens_seen": 84765415, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.59375, "step": 3936, "time_per_iteration": 2.403423547744751 }, { "auxiliary_loss_clip": 0.01088847, "auxiliary_loss_mlp": 0.01038309, "balance_loss_clip": 1.02030861, "balance_loss_mlp": 1.0280695, "epoch": 0.23670524575379528, "flos": 22636775197440.0, "grad_norm": 1.7256088803435925, "language_loss": 0.79123998, "learning_rate": 3.4723902252531925e-06, "loss": 0.81251156, "num_input_tokens_seen": 84787080, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.609375, "step": 3937, "time_per_iteration": 2.4090025424957275 }, { "auxiliary_loss_clip": 0.01086062, "auxiliary_loss_mlp": 0.01036133, "balance_loss_clip": 1.0198313, "balance_loss_mlp": 1.02775407, "epoch": 0.23676536900646325, "flos": 16724129316480.0, "grad_norm": 1.7795879807522792, "language_loss": 0.85066378, "learning_rate": 3.472134500409921e-06, "loss": 0.87188578, "num_input_tokens_seen": 84805395, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.58203125, "step": 3938, "time_per_iteration": 2.3687632083892822 }, { "auxiliary_loss_clip": 0.01084712, "auxiliary_loss_mlp": 0.01042457, "balance_loss_clip": 1.024683, "balance_loss_mlp": 1.02572882, "epoch": 0.23682549225913122, "flos": 11135093627520.0, "grad_norm": 2.1358145243909723, "language_loss": 0.94085848, "learning_rate": 3.471878723029956e-06, "loss": 0.96213019, "num_input_tokens_seen": 84818090, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58984375, "step": 3939, "time_per_iteration": 2.309140920639038 }, { "auxiliary_loss_clip": 0.01086833, "auxiliary_loss_mlp": 0.01036452, "balance_loss_clip": 1.0177362, "balance_loss_mlp": 1.0254091, "epoch": 0.23688561551179918, "flos": 22558290727680.0, "grad_norm": 1.570453226292029, "language_loss": 0.8218323, "learning_rate": 3.4716228931224253e-06, "loss": 0.84306526, "num_input_tokens_seen": 84837695, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.61328125, "step": 3940, "time_per_iteration": 2.395319700241089 }, { "auxiliary_loss_clip": 0.01092457, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.02151346, "balance_loss_mlp": 1.02784467, "epoch": 0.23694573876446715, "flos": 18513916721280.0, "grad_norm": 2.05952993011614, "language_loss": 0.89007425, "learning_rate": 3.4713670106964596e-06, "loss": 0.91139746, "num_input_tokens_seen": 84854630, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6484375, "step": 3941, "time_per_iteration": 2.3423094749450684 }, { "auxiliary_loss_clip": 0.01086054, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.01593494, "balance_loss_mlp": 1.02536297, "epoch": 0.23700586201713514, "flos": 15334970296320.0, "grad_norm": 1.912205034027911, "language_loss": 0.84782934, "learning_rate": 3.4711110757611897e-06, "loss": 0.86903226, "num_input_tokens_seen": 84871805, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.60546875, "step": 3942, "time_per_iteration": 2.341895818710327 }, { "auxiliary_loss_clip": 0.0108676, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.0150528, "balance_loss_mlp": 1.02570486, "epoch": 0.2370659852698031, "flos": 23946576963840.0, "grad_norm": 1.8294419142178115, "language_loss": 0.81383359, "learning_rate": 3.4708550883257496e-06, "loss": 0.83502716, "num_input_tokens_seen": 84889815, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.609375, "step": 3943, "time_per_iteration": 2.427168846130371 }, { "auxiliary_loss_clip": 0.01088801, "auxiliary_loss_mlp": 0.01034218, "balance_loss_clip": 1.01532364, "balance_loss_mlp": 1.02520633, "epoch": 0.23712610852247107, "flos": 15331863185280.0, "grad_norm": 9.09334172791455, "language_loss": 0.67308927, "learning_rate": 3.4705990483992746e-06, "loss": 0.69431949, "num_input_tokens_seen": 84904380, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.63671875, "step": 3944, "time_per_iteration": 2.3187944889068604 }, { "auxiliary_loss_clip": 0.0108925, "auxiliary_loss_mlp": 0.01039393, "balance_loss_clip": 1.01998532, "balance_loss_mlp": 1.02612245, "epoch": 0.23718623177513903, "flos": 19681551964800.0, "grad_norm": 1.671039651844236, "language_loss": 0.75468224, "learning_rate": 3.470342955990903e-06, "loss": 0.77596867, "num_input_tokens_seen": 84922935, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6328125, "step": 3945, "time_per_iteration": 2.37501859664917 }, { "auxiliary_loss_clip": 0.01086521, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.01713407, "balance_loss_mlp": 1.02697992, "epoch": 0.237246355027807, "flos": 24972150418560.0, "grad_norm": 1.4428215725623434, "language_loss": 0.63798368, "learning_rate": 3.470086811109773e-06, "loss": 0.65918934, "num_input_tokens_seen": 84943685, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.59375, "step": 3946, "time_per_iteration": 2.4165070056915283 }, { "auxiliary_loss_clip": 0.01085289, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.01466691, "balance_loss_mlp": 1.02429485, "epoch": 0.23730647828047496, "flos": 15376516680960.0, "grad_norm": 3.4707146167044516, "language_loss": 0.77266467, "learning_rate": 3.469830613765026e-06, "loss": 0.79385793, "num_input_tokens_seen": 84959505, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.609375, "step": 3947, "time_per_iteration": 2.347360134124756 }, { "auxiliary_loss_clip": 0.01091679, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.01659989, "balance_loss_mlp": 1.02870166, "epoch": 0.23736660153314296, "flos": 28149316364160.0, "grad_norm": 1.4133151306437455, "language_loss": 0.80498493, "learning_rate": 3.4695743639658065e-06, "loss": 0.82624948, "num_input_tokens_seen": 84982130, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6328125, "step": 3948, "time_per_iteration": 2.452707529067993 }, { "auxiliary_loss_clip": 0.01089203, "auxiliary_loss_mlp": 0.01036481, "balance_loss_clip": 1.0187546, "balance_loss_mlp": 1.02731228, "epoch": 0.23742672478581092, "flos": 22085601563520.0, "grad_norm": 1.7015740092423202, "language_loss": 0.80448139, "learning_rate": 3.4693180617212568e-06, "loss": 0.82573825, "num_input_tokens_seen": 85000640, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6171875, "step": 3949, "time_per_iteration": 2.372466802597046 }, { "auxiliary_loss_clip": 0.01087709, "auxiliary_loss_mlp": 0.0103332, "balance_loss_clip": 1.01535463, "balance_loss_mlp": 1.02515912, "epoch": 0.2374868480384789, "flos": 19536068862720.0, "grad_norm": 1.7302396041739636, "language_loss": 0.73320466, "learning_rate": 3.4690617070405255e-06, "loss": 0.75441492, "num_input_tokens_seen": 85018970, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 3950, "time_per_iteration": 2.3847105503082275 }, { "auxiliary_loss_clip": 0.01082571, "auxiliary_loss_mlp": 0.01030741, "balance_loss_clip": 1.01471925, "balance_loss_mlp": 1.024845, "epoch": 0.23754697129114685, "flos": 19421623825920.0, "grad_norm": 1.8489448929771304, "language_loss": 0.7334522, "learning_rate": 3.4688052999327607e-06, "loss": 0.75458527, "num_input_tokens_seen": 85035905, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.578125, "step": 3951, "time_per_iteration": 2.384700059890747 }, { "auxiliary_loss_clip": 0.01090804, "auxiliary_loss_mlp": 0.01036593, "balance_loss_clip": 1.01792455, "balance_loss_mlp": 1.02802944, "epoch": 0.23760709454381482, "flos": 19499968650240.0, "grad_norm": 1.677284176615736, "language_loss": 0.73947823, "learning_rate": 3.4685488404071133e-06, "loss": 0.7607522, "num_input_tokens_seen": 85054560, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.62890625, "step": 3952, "time_per_iteration": 2.361034631729126 }, { "auxiliary_loss_clip": 0.0108799, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.01976752, "balance_loss_mlp": 1.02603579, "epoch": 0.23766721779648278, "flos": 27635360106240.0, "grad_norm": 1.5815686078865576, "language_loss": 0.71242404, "learning_rate": 3.468292328472735e-06, "loss": 0.73366463, "num_input_tokens_seen": 85074425, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.6171875, "step": 3953, "time_per_iteration": 2.4313089847564697 }, { "auxiliary_loss_clip": 0.01088287, "auxiliary_loss_mlp": 0.01037609, "balance_loss_clip": 1.01906037, "balance_loss_mlp": 1.02595127, "epoch": 0.23772734104915075, "flos": 23403223474560.0, "grad_norm": 1.7210777377658004, "language_loss": 0.81412822, "learning_rate": 3.468035764138781e-06, "loss": 0.83538717, "num_input_tokens_seen": 85092865, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.625, "step": 3954, "time_per_iteration": 2.399097442626953 }, { "auxiliary_loss_clip": 0.01088944, "auxiliary_loss_mlp": 0.01039018, "balance_loss_clip": 1.01989686, "balance_loss_mlp": 1.02651834, "epoch": 0.23778746430181874, "flos": 15704595527040.0, "grad_norm": 2.0400836021449007, "language_loss": 0.66015524, "learning_rate": 3.467779147414406e-06, "loss": 0.68143481, "num_input_tokens_seen": 85110175, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.625, "step": 3955, "time_per_iteration": 2.3557753562927246 }, { "auxiliary_loss_clip": 0.01087135, "auxiliary_loss_mlp": 0.01039377, "balance_loss_clip": 1.02224684, "balance_loss_mlp": 1.02688372, "epoch": 0.2378475875544867, "flos": 19425464075520.0, "grad_norm": 1.3208309825519677, "language_loss": 0.83773601, "learning_rate": 3.467522478308769e-06, "loss": 0.85900116, "num_input_tokens_seen": 85129925, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6015625, "step": 3956, "time_per_iteration": 2.382812976837158 }, { "auxiliary_loss_clip": 0.0108376, "auxiliary_loss_mlp": 0.01033172, "balance_loss_clip": 1.01636314, "balance_loss_mlp": 1.02641165, "epoch": 0.23790771080715467, "flos": 22267603814400.0, "grad_norm": 2.0295910925093192, "language_loss": 0.84805679, "learning_rate": 3.46726575683103e-06, "loss": 0.8692261, "num_input_tokens_seen": 85147755, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.57421875, "step": 3957, "time_per_iteration": 2.378727674484253 }, { "auxiliary_loss_clip": 0.01088326, "auxiliary_loss_mlp": 0.01040559, "balance_loss_clip": 1.02335739, "balance_loss_mlp": 1.02813232, "epoch": 0.23796783405982264, "flos": 20046289605120.0, "grad_norm": 1.800800681265894, "language_loss": 0.69876516, "learning_rate": 3.4670089829903503e-06, "loss": 0.72005403, "num_input_tokens_seen": 85165270, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6015625, "step": 3958, "time_per_iteration": 2.3807365894317627 }, { "auxiliary_loss_clip": 0.01086791, "auxiliary_loss_mlp": 0.01034691, "balance_loss_clip": 1.01586759, "balance_loss_mlp": 1.02595806, "epoch": 0.2380279573124906, "flos": 14245086384000.0, "grad_norm": 2.2052733126330866, "language_loss": 0.65799189, "learning_rate": 3.466752156795893e-06, "loss": 0.67920673, "num_input_tokens_seen": 85181555, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.609375, "step": 3959, "time_per_iteration": 2.356825828552246 }, { "auxiliary_loss_clip": 0.01086594, "auxiliary_loss_mlp": 0.01037715, "balance_loss_clip": 1.02021527, "balance_loss_mlp": 1.02528739, "epoch": 0.23808808056515857, "flos": 21178103927040.0, "grad_norm": 1.7360593288961228, "language_loss": 0.72312939, "learning_rate": 3.4664952782568253e-06, "loss": 0.74437243, "num_input_tokens_seen": 85199455, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.61328125, "step": 3960, "time_per_iteration": 2.375762462615967 }, { "auxiliary_loss_clip": 0.01088092, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.01455569, "balance_loss_mlp": 1.02788997, "epoch": 0.23814820381782653, "flos": 22527217751040.0, "grad_norm": 1.5031513570709232, "language_loss": 0.74265003, "learning_rate": 3.466238347382313e-06, "loss": 0.7638545, "num_input_tokens_seen": 85219170, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 3961, "time_per_iteration": 2.3914270401000977 }, { "auxiliary_loss_clip": 0.0108734, "auxiliary_loss_mlp": 0.01038095, "balance_loss_clip": 1.01850939, "balance_loss_mlp": 1.0248363, "epoch": 0.23820832707049452, "flos": 22303389824640.0, "grad_norm": 1.7264029518346724, "language_loss": 0.66661024, "learning_rate": 3.465981364181525e-06, "loss": 0.6878646, "num_input_tokens_seen": 85238480, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.625, "step": 3962, "time_per_iteration": 2.3761768341064453 }, { "auxiliary_loss_clip": 0.01086207, "auxiliary_loss_mlp": 0.0103699, "balance_loss_clip": 1.01980567, "balance_loss_mlp": 1.02590299, "epoch": 0.2382684503231625, "flos": 24863046819840.0, "grad_norm": 1.5728260116930093, "language_loss": 0.74494505, "learning_rate": 3.4657243286636332e-06, "loss": 0.766177, "num_input_tokens_seen": 85259180, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6015625, "step": 3963, "time_per_iteration": 3.786365270614624 }, { "auxiliary_loss_clip": 0.01090366, "auxiliary_loss_mlp": 0.01036066, "balance_loss_clip": 1.01745772, "balance_loss_mlp": 1.02874279, "epoch": 0.23832857357583045, "flos": 21870536388480.0, "grad_norm": 1.8762828014026218, "language_loss": 0.77434373, "learning_rate": 3.4654672408378107e-06, "loss": 0.79560804, "num_input_tokens_seen": 85278550, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6171875, "step": 3964, "time_per_iteration": 2.381047248840332 }, { "auxiliary_loss_clip": 0.01086723, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.01869905, "balance_loss_mlp": 1.02703702, "epoch": 0.23838869682849842, "flos": 21286998057600.0, "grad_norm": 1.821916289597902, "language_loss": 0.7098124, "learning_rate": 3.4652101007132323e-06, "loss": 0.73103452, "num_input_tokens_seen": 85297345, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.59765625, "step": 3965, "time_per_iteration": 2.37895131111145 }, { "auxiliary_loss_clip": 0.01083686, "auxiliary_loss_mlp": 0.01040786, "balance_loss_clip": 1.02383471, "balance_loss_mlp": 1.02591228, "epoch": 0.23844882008116638, "flos": 16179658663680.0, "grad_norm": 1.7384444598575801, "language_loss": 0.77981144, "learning_rate": 3.4649529082990743e-06, "loss": 0.80105615, "num_input_tokens_seen": 85315105, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.578125, "step": 3966, "time_per_iteration": 2.3644492626190186 }, { "auxiliary_loss_clip": 0.01085032, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.0161047, "balance_loss_mlp": 1.02626991, "epoch": 0.23850894333383435, "flos": 21068651214720.0, "grad_norm": 1.667296043610137, "language_loss": 0.68478042, "learning_rate": 3.4646956636045152e-06, "loss": 0.70595366, "num_input_tokens_seen": 85334735, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5859375, "step": 3967, "time_per_iteration": 2.3833632469177246 }, { "auxiliary_loss_clip": 0.01086896, "auxiliary_loss_mlp": 0.01039739, "balance_loss_clip": 1.021667, "balance_loss_mlp": 1.02540779, "epoch": 0.23856906658650234, "flos": 17200658730240.0, "grad_norm": 1.8830429758951257, "language_loss": 0.68007058, "learning_rate": 3.4644383666387347e-06, "loss": 0.70133692, "num_input_tokens_seen": 85352875, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6171875, "step": 3968, "time_per_iteration": 3.6999571323394775 }, { "auxiliary_loss_clip": 0.01084379, "auxiliary_loss_mlp": 0.01032561, "balance_loss_clip": 1.01556206, "balance_loss_mlp": 1.02466679, "epoch": 0.2386291898391703, "flos": 29493018927360.0, "grad_norm": 1.8585158487140065, "language_loss": 0.76495361, "learning_rate": 3.464181017410917e-06, "loss": 0.78612304, "num_input_tokens_seen": 85372205, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59765625, "step": 3969, "time_per_iteration": 3.7654290199279785 }, { "auxiliary_loss_clip": 0.01082321, "auxiliary_loss_mlp": 0.01031209, "balance_loss_clip": 1.0156877, "balance_loss_mlp": 1.02609754, "epoch": 0.23868931309183827, "flos": 21141375310080.0, "grad_norm": 2.1294684203951957, "language_loss": 0.76286387, "learning_rate": 3.463923615930245e-06, "loss": 0.7839992, "num_input_tokens_seen": 85389705, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5625, "step": 3970, "time_per_iteration": 2.417985439300537 }, { "auxiliary_loss_clip": 0.01085232, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.01732349, "balance_loss_mlp": 1.02476001, "epoch": 0.23874943634450624, "flos": 25658403569280.0, "grad_norm": 2.0755100414343644, "language_loss": 0.85321903, "learning_rate": 3.4636661622059042e-06, "loss": 0.87443066, "num_input_tokens_seen": 85407855, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.60546875, "step": 3971, "time_per_iteration": 2.4149224758148193 }, { "auxiliary_loss_clip": 0.01014603, "auxiliary_loss_mlp": 0.01003239, "balance_loss_clip": 1.00128365, "balance_loss_mlp": 1.00180185, "epoch": 0.2388095595971742, "flos": 58983243079680.0, "grad_norm": 0.7724852864188745, "language_loss": 0.62812436, "learning_rate": 3.4634086562470835e-06, "loss": 0.64830279, "num_input_tokens_seen": 85470885, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.12792969, "step": 3972, "time_per_iteration": 3.096389055252075 }, { "auxiliary_loss_clip": 0.01086765, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.01713181, "balance_loss_mlp": 1.02610457, "epoch": 0.23886968284984217, "flos": 16799401941120.0, "grad_norm": 1.9784460532311081, "language_loss": 0.81820315, "learning_rate": 3.463151098062972e-06, "loss": 0.8394084, "num_input_tokens_seen": 85488460, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.60546875, "step": 3973, "time_per_iteration": 3.750425338745117 }, { "auxiliary_loss_clip": 0.0108626, "auxiliary_loss_mlp": 0.01039221, "balance_loss_clip": 1.0221262, "balance_loss_mlp": 1.02646852, "epoch": 0.23892980610251013, "flos": 22381560092160.0, "grad_norm": 1.5480586435699946, "language_loss": 0.79407525, "learning_rate": 3.4628934876627615e-06, "loss": 0.81533009, "num_input_tokens_seen": 85508590, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.6015625, "step": 3974, "time_per_iteration": 2.3793485164642334 }, { "auxiliary_loss_clip": 0.01085883, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.01733375, "balance_loss_mlp": 1.02519321, "epoch": 0.23898992935517813, "flos": 12822375680640.0, "grad_norm": 2.6404490165027985, "language_loss": 0.84646249, "learning_rate": 3.4626358250556458e-06, "loss": 0.86768401, "num_input_tokens_seen": 85525970, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.609375, "step": 3975, "time_per_iteration": 2.3446414470672607 }, { "auxiliary_loss_clip": 0.01084979, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.01221466, "balance_loss_mlp": 1.025787, "epoch": 0.2390500526078461, "flos": 22344587095680.0, "grad_norm": 2.070458941367138, "language_loss": 0.83398485, "learning_rate": 3.4623781102508193e-06, "loss": 0.85512519, "num_input_tokens_seen": 85543700, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.59375, "step": 3976, "time_per_iteration": 2.3877251148223877 }, { "auxiliary_loss_clip": 0.01083738, "auxiliary_loss_mlp": 0.01028535, "balance_loss_clip": 1.01259685, "balance_loss_mlp": 1.0247519, "epoch": 0.23911017586051406, "flos": 22634121934080.0, "grad_norm": 1.7267919942100842, "language_loss": 0.74378598, "learning_rate": 3.46212034325748e-06, "loss": 0.76490867, "num_input_tokens_seen": 85562765, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.58984375, "step": 3977, "time_per_iteration": 2.3767457008361816 }, { "auxiliary_loss_clip": 0.01088694, "auxiliary_loss_mlp": 0.0103766, "balance_loss_clip": 1.02079177, "balance_loss_mlp": 1.02736568, "epoch": 0.23917029911318202, "flos": 23652329091840.0, "grad_norm": 1.725665327127498, "language_loss": 0.72010094, "learning_rate": 3.4618625240848264e-06, "loss": 0.74136448, "num_input_tokens_seen": 85581755, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.61328125, "step": 3978, "time_per_iteration": 2.3961164951324463 }, { "auxiliary_loss_clip": 0.01090494, "auxiliary_loss_mlp": 0.01036339, "balance_loss_clip": 1.01925635, "balance_loss_mlp": 1.02804995, "epoch": 0.23923042236585, "flos": 22782502679040.0, "grad_norm": 2.442932903960139, "language_loss": 0.78815722, "learning_rate": 3.4616046527420597e-06, "loss": 0.80942559, "num_input_tokens_seen": 85599455, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.625, "step": 3979, "time_per_iteration": 2.384162425994873 }, { "auxiliary_loss_clip": 0.01085504, "auxiliary_loss_mlp": 0.0104775, "balance_loss_clip": 1.02966619, "balance_loss_mlp": 1.02607298, "epoch": 0.23929054561851795, "flos": 28146453632640.0, "grad_norm": 1.6629882301459435, "language_loss": 0.81845599, "learning_rate": 3.4613467292383832e-06, "loss": 0.83978856, "num_input_tokens_seen": 85619970, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.59375, "step": 3980, "time_per_iteration": 2.433626890182495 }, { "auxiliary_loss_clip": 0.01084021, "auxiliary_loss_mlp": 0.01030556, "balance_loss_clip": 1.01403391, "balance_loss_mlp": 1.02522397, "epoch": 0.23935066887118592, "flos": 21685531760640.0, "grad_norm": 1.6318570764469262, "language_loss": 0.83637077, "learning_rate": 3.4610887535830005e-06, "loss": 0.85751653, "num_input_tokens_seen": 85638850, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5859375, "step": 3981, "time_per_iteration": 2.396655321121216 }, { "auxiliary_loss_clip": 0.01087803, "auxiliary_loss_mlp": 0.01037309, "balance_loss_clip": 1.01887965, "balance_loss_mlp": 1.02614427, "epoch": 0.2394107921238539, "flos": 32120966275200.0, "grad_norm": 1.686774687864199, "language_loss": 0.76628423, "learning_rate": 3.4608307257851186e-06, "loss": 0.78753537, "num_input_tokens_seen": 85656285, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6171875, "step": 3982, "time_per_iteration": 2.5224783420562744 }, { "auxiliary_loss_clip": 0.01084124, "auxiliary_loss_mlp": 0.0103397, "balance_loss_clip": 1.01786518, "balance_loss_mlp": 1.02578878, "epoch": 0.23947091537652188, "flos": 17018237543040.0, "grad_norm": 1.5623356818917253, "language_loss": 0.77890348, "learning_rate": 3.460572645853946e-06, "loss": 0.80008441, "num_input_tokens_seen": 85673020, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5859375, "step": 3983, "time_per_iteration": 2.391585111618042 }, { "auxiliary_loss_clip": 0.010859, "auxiliary_loss_mlp": 0.01039087, "balance_loss_clip": 1.02035952, "balance_loss_mlp": 1.02555084, "epoch": 0.23953103862918984, "flos": 20592575648640.0, "grad_norm": 2.045453054094221, "language_loss": 0.73051536, "learning_rate": 3.4603145137986925e-06, "loss": 0.75176525, "num_input_tokens_seen": 85692565, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6015625, "step": 3984, "time_per_iteration": 2.386662006378174 }, { "auxiliary_loss_clip": 0.01087001, "auxiliary_loss_mlp": 0.01033368, "balance_loss_clip": 1.0162853, "balance_loss_mlp": 1.02588665, "epoch": 0.2395911618818578, "flos": 20703354992640.0, "grad_norm": 2.915130429326987, "language_loss": 0.79223025, "learning_rate": 3.4600563296285704e-06, "loss": 0.81343389, "num_input_tokens_seen": 85709730, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.609375, "step": 3985, "time_per_iteration": 2.398803234100342 }, { "auxiliary_loss_clip": 0.01087986, "auxiliary_loss_mlp": 0.01040145, "balance_loss_clip": 1.02210879, "balance_loss_mlp": 1.02877975, "epoch": 0.23965128513452577, "flos": 27052275623040.0, "grad_norm": 1.7163930383911268, "language_loss": 0.73366785, "learning_rate": 3.459798093352794e-06, "loss": 0.75494915, "num_input_tokens_seen": 85730045, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.59375, "step": 3986, "time_per_iteration": 2.4272451400756836 }, { "auxiliary_loss_clip": 0.01088714, "auxiliary_loss_mlp": 0.01039063, "balance_loss_clip": 1.0219568, "balance_loss_mlp": 1.02616405, "epoch": 0.23971140838719374, "flos": 23143330247040.0, "grad_norm": 1.7421897543799214, "language_loss": 0.87564272, "learning_rate": 3.4595398049805783e-06, "loss": 0.8969205, "num_input_tokens_seen": 85747590, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.625, "step": 3987, "time_per_iteration": 2.3867604732513428 }, { "auxiliary_loss_clip": 0.01081476, "auxiliary_loss_mlp": 0.0103769, "balance_loss_clip": 1.02188313, "balance_loss_mlp": 1.02601147, "epoch": 0.23977153163986173, "flos": 18033756526080.0, "grad_norm": 2.2483503373333007, "language_loss": 0.82897425, "learning_rate": 3.459281464521142e-06, "loss": 0.8501659, "num_input_tokens_seen": 85763460, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5546875, "step": 3988, "time_per_iteration": 2.333495855331421 }, { "auxiliary_loss_clip": 0.01085015, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.01317835, "balance_loss_mlp": 1.02549815, "epoch": 0.2398316548925297, "flos": 18112415552640.0, "grad_norm": 1.66702502877762, "language_loss": 0.85587507, "learning_rate": 3.459023071983703e-06, "loss": 0.87703288, "num_input_tokens_seen": 85782050, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 3989, "time_per_iteration": 2.374199867248535 }, { "auxiliary_loss_clip": 0.0108426, "auxiliary_loss_mlp": 0.01034761, "balance_loss_clip": 1.01782095, "balance_loss_mlp": 1.0245297, "epoch": 0.23989177814519766, "flos": 12566916195840.0, "grad_norm": 2.0491048247908528, "language_loss": 0.8434158, "learning_rate": 3.458764627377484e-06, "loss": 0.86460602, "num_input_tokens_seen": 85797400, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.59765625, "step": 3990, "time_per_iteration": 2.3547685146331787 }, { "auxiliary_loss_clip": 0.01082816, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.01781273, "balance_loss_mlp": 1.02518666, "epoch": 0.23995190139786562, "flos": 25263430824960.0, "grad_norm": 1.8276982580432377, "language_loss": 0.75659311, "learning_rate": 3.458506130711708e-06, "loss": 0.77775925, "num_input_tokens_seen": 85818995, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.578125, "step": 3991, "time_per_iteration": 2.4176342487335205 }, { "auxiliary_loss_clip": 0.01016676, "auxiliary_loss_mlp": 0.01002435, "balance_loss_clip": 1.00033665, "balance_loss_mlp": 1.00356472, "epoch": 0.2400120246505336, "flos": 61957425047040.0, "grad_norm": 0.8861831914865831, "language_loss": 0.63729334, "learning_rate": 3.4582475819955995e-06, "loss": 0.65748447, "num_input_tokens_seen": 85876695, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.13085938, "step": 3992, "time_per_iteration": 2.9383041858673096 }, { "auxiliary_loss_clip": 0.01015881, "auxiliary_loss_mlp": 0.01003887, "balance_loss_clip": 1.00176477, "balance_loss_mlp": 1.00301957, "epoch": 0.24007214790320155, "flos": 66705333327360.0, "grad_norm": 0.7551399710724225, "language_loss": 0.62935436, "learning_rate": 3.457988981238386e-06, "loss": 0.64955205, "num_input_tokens_seen": 85940990, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.12890625, "step": 3993, "time_per_iteration": 3.1410489082336426 }, { "auxiliary_loss_clip": 0.01090296, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.02278697, "balance_loss_mlp": 1.02814472, "epoch": 0.24013227115586952, "flos": 25807971300480.0, "grad_norm": 1.4519346805010132, "language_loss": 0.76740205, "learning_rate": 3.457730328449296e-06, "loss": 0.78871101, "num_input_tokens_seen": 85961165, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.62109375, "step": 3994, "time_per_iteration": 2.4315245151519775 }, { "auxiliary_loss_clip": 0.01087546, "auxiliary_loss_mlp": 0.01044022, "balance_loss_clip": 1.02358913, "balance_loss_mlp": 1.02564549, "epoch": 0.2401923944085375, "flos": 25556282242560.0, "grad_norm": 1.8370602115244423, "language_loss": 0.7836957, "learning_rate": 3.457471623637561e-06, "loss": 0.80501139, "num_input_tokens_seen": 85982710, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.6171875, "step": 3995, "time_per_iteration": 2.4415955543518066 }, { "auxiliary_loss_clip": 0.01015788, "auxiliary_loss_mlp": 0.01010019, "balance_loss_clip": 1.00781357, "balance_loss_mlp": 1.00253904, "epoch": 0.24025251766120548, "flos": 54937751909760.0, "grad_norm": 0.9058747089563907, "language_loss": 0.63457066, "learning_rate": 3.457212866812412e-06, "loss": 0.65482873, "num_input_tokens_seen": 86046935, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.1328125, "step": 3996, "time_per_iteration": 3.1012489795684814 }, { "auxiliary_loss_clip": 0.0108904, "auxiliary_loss_mlp": 0.01034865, "balance_loss_clip": 1.01726937, "balance_loss_mlp": 1.02666843, "epoch": 0.24031264091387344, "flos": 20630037404160.0, "grad_norm": 2.4189437709741135, "language_loss": 0.70645809, "learning_rate": 3.4569540579830853e-06, "loss": 0.72769713, "num_input_tokens_seen": 86064355, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.625, "step": 3997, "time_per_iteration": 2.3884925842285156 }, { "auxiliary_loss_clip": 0.01086762, "auxiliary_loss_mlp": 0.01035178, "balance_loss_clip": 1.01834548, "balance_loss_mlp": 1.02772617, "epoch": 0.2403727641665414, "flos": 20885217598080.0, "grad_norm": 1.6862980998775594, "language_loss": 0.87110609, "learning_rate": 3.456695197158815e-06, "loss": 0.89232552, "num_input_tokens_seen": 86081340, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.58984375, "step": 3998, "time_per_iteration": 2.3747923374176025 }, { "auxiliary_loss_clip": 0.01087042, "auxiliary_loss_mlp": 0.01034927, "balance_loss_clip": 1.01695061, "balance_loss_mlp": 1.02427578, "epoch": 0.24043288741920937, "flos": 22818952005120.0, "grad_norm": 1.8146267175905189, "language_loss": 0.75909293, "learning_rate": 3.4564362843488403e-06, "loss": 0.78031266, "num_input_tokens_seen": 86102260, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.62890625, "step": 3999, "time_per_iteration": 2.4066219329833984 }, { "auxiliary_loss_clip": 0.01088464, "auxiliary_loss_mlp": 0.01037473, "balance_loss_clip": 1.02061713, "balance_loss_mlp": 1.02898049, "epoch": 0.24049301067187734, "flos": 27958551361920.0, "grad_norm": 1.994489415676596, "language_loss": 0.72343725, "learning_rate": 3.4561773195624015e-06, "loss": 0.74469668, "num_input_tokens_seen": 86123400, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.59375, "step": 4000, "time_per_iteration": 2.4440486431121826 }, { "auxiliary_loss_clip": 0.01092571, "auxiliary_loss_mlp": 0.01034281, "balance_loss_clip": 1.01569605, "balance_loss_mlp": 1.0288868, "epoch": 0.24055313392454533, "flos": 27450250744320.0, "grad_norm": 1.828478248269627, "language_loss": 0.66823041, "learning_rate": 3.4559183028087394e-06, "loss": 0.6894989, "num_input_tokens_seen": 86144060, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.63671875, "step": 4001, "time_per_iteration": 2.4392964839935303 }, { "auxiliary_loss_clip": 0.0108736, "auxiliary_loss_mlp": 0.01032715, "balance_loss_clip": 1.01525068, "balance_loss_mlp": 1.02682328, "epoch": 0.2406132571772133, "flos": 25555444369920.0, "grad_norm": 2.6887092498307257, "language_loss": 0.82976794, "learning_rate": 3.4556592340970983e-06, "loss": 0.85096872, "num_input_tokens_seen": 86163005, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.60546875, "step": 4002, "time_per_iteration": 2.4116885662078857 }, { "auxiliary_loss_clip": 0.01088875, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.02040708, "balance_loss_mlp": 1.02849126, "epoch": 0.24067338042988126, "flos": 24790217990400.0, "grad_norm": 1.8993811145234223, "language_loss": 0.83022451, "learning_rate": 3.4554001134367237e-06, "loss": 0.85148919, "num_input_tokens_seen": 86182580, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6015625, "step": 4003, "time_per_iteration": 3.7910866737365723 }, { "auxiliary_loss_clip": 0.01088616, "auxiliary_loss_mlp": 0.01030393, "balance_loss_clip": 1.01308382, "balance_loss_mlp": 1.02802551, "epoch": 0.24073350368254923, "flos": 21176882029440.0, "grad_norm": 1.9592529864304187, "language_loss": 0.87321031, "learning_rate": 3.4551409408368627e-06, "loss": 0.89440036, "num_input_tokens_seen": 86200665, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.60546875, "step": 4004, "time_per_iteration": 2.387267589569092 }, { "auxiliary_loss_clip": 0.01089989, "auxiliary_loss_mlp": 0.01047377, "balance_loss_clip": 1.02936447, "balance_loss_mlp": 1.02769518, "epoch": 0.2407936269352172, "flos": 22493142397440.0, "grad_norm": 1.7521791139733762, "language_loss": 0.78025109, "learning_rate": 3.4548817163067643e-06, "loss": 0.80162477, "num_input_tokens_seen": 86221640, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 4005, "time_per_iteration": 2.4593660831451416 }, { "auxiliary_loss_clip": 0.01086327, "auxiliary_loss_mlp": 0.01030745, "balance_loss_clip": 1.01314926, "balance_loss_mlp": 1.02741086, "epoch": 0.24085375018788516, "flos": 18550156579200.0, "grad_norm": 1.6009108353701194, "language_loss": 0.79030287, "learning_rate": 3.4546224398556804e-06, "loss": 0.81147361, "num_input_tokens_seen": 86240795, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.58984375, "step": 4006, "time_per_iteration": 2.377345085144043 }, { "auxiliary_loss_clip": 0.01090476, "auxiliary_loss_mlp": 0.01036911, "balance_loss_clip": 1.01683617, "balance_loss_mlp": 1.02639747, "epoch": 0.24091387344055312, "flos": 24169392460800.0, "grad_norm": 1.7063835853425924, "language_loss": 0.70962864, "learning_rate": 3.4543631114928627e-06, "loss": 0.73090243, "num_input_tokens_seen": 86262000, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 0.640625, "step": 4007, "time_per_iteration": 3.82952880859375 }, { "auxiliary_loss_clip": 0.01087084, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.01698875, "balance_loss_mlp": 1.02690339, "epoch": 0.24097399669322112, "flos": 11035520830080.0, "grad_norm": 1.7695115017731504, "language_loss": 0.76055849, "learning_rate": 3.454103731227567e-06, "loss": 0.78176832, "num_input_tokens_seen": 86279680, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.6015625, "step": 4008, "time_per_iteration": 3.7759323120117188 }, { "auxiliary_loss_clip": 0.0108743, "auxiliary_loss_mlp": 0.01035047, "balance_loss_clip": 1.01684332, "balance_loss_mlp": 1.02693796, "epoch": 0.24103411994588908, "flos": 17164139581440.0, "grad_norm": 2.4837077594938743, "language_loss": 0.74269611, "learning_rate": 3.4538442990690494e-06, "loss": 0.7639209, "num_input_tokens_seen": 86297180, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6015625, "step": 4009, "time_per_iteration": 2.35188364982605 }, { "auxiliary_loss_clip": 0.01086402, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.01790273, "balance_loss_mlp": 1.02748787, "epoch": 0.24109424319855705, "flos": 20666905666560.0, "grad_norm": 1.6192510377422105, "language_loss": 0.80016541, "learning_rate": 3.4535848150265684e-06, "loss": 0.82137728, "num_input_tokens_seen": 86317660, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.58984375, "step": 4010, "time_per_iteration": 2.394073009490967 }, { "auxiliary_loss_clip": 0.01087002, "auxiliary_loss_mlp": 0.01038271, "balance_loss_clip": 1.01880407, "balance_loss_mlp": 1.02544391, "epoch": 0.241154366451225, "flos": 28180598808960.0, "grad_norm": 1.7287118550449876, "language_loss": 0.70196462, "learning_rate": 3.453325279109385e-06, "loss": 0.72321737, "num_input_tokens_seen": 86338325, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.6171875, "step": 4011, "time_per_iteration": 2.4279544353485107 }, { "auxiliary_loss_clip": 0.01085882, "auxiliary_loss_mlp": 0.01033038, "balance_loss_clip": 1.01656342, "balance_loss_mlp": 1.02537155, "epoch": 0.24121448970389298, "flos": 21688638871680.0, "grad_norm": 1.6895419506254177, "language_loss": 0.6934607, "learning_rate": 3.45306569132676e-06, "loss": 0.71464986, "num_input_tokens_seen": 86357615, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.60546875, "step": 4012, "time_per_iteration": 3.7878193855285645 }, { "auxiliary_loss_clip": 0.01084459, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.01620817, "balance_loss_mlp": 1.02454436, "epoch": 0.24127461295656094, "flos": 39674634791040.0, "grad_norm": 2.202969993406684, "language_loss": 0.73634732, "learning_rate": 3.4528060516879587e-06, "loss": 0.75753653, "num_input_tokens_seen": 86380355, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.6015625, "step": 4013, "time_per_iteration": 2.538541555404663 }, { "auxiliary_loss_clip": 0.01088306, "auxiliary_loss_mlp": 0.01029912, "balance_loss_clip": 1.01327085, "balance_loss_mlp": 1.02741277, "epoch": 0.2413347362092289, "flos": 19134846984960.0, "grad_norm": 2.1172922822267872, "language_loss": 0.88291633, "learning_rate": 3.4525463602022465e-06, "loss": 0.90409851, "num_input_tokens_seen": 86399125, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.609375, "step": 4014, "time_per_iteration": 2.369629144668579 }, { "auxiliary_loss_clip": 0.01089427, "auxiliary_loss_mlp": 0.01035263, "balance_loss_clip": 1.01686883, "balance_loss_mlp": 1.02668977, "epoch": 0.2413948594618969, "flos": 26938319345280.0, "grad_norm": 1.854821965618708, "language_loss": 0.94809508, "learning_rate": 3.452286616878891e-06, "loss": 0.96934193, "num_input_tokens_seen": 86418625, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.62890625, "step": 4015, "time_per_iteration": 2.415951728820801 }, { "auxiliary_loss_clip": 0.01088434, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.01816916, "balance_loss_mlp": 1.02596617, "epoch": 0.24145498271456486, "flos": 25226946587520.0, "grad_norm": 1.5234313401014352, "language_loss": 0.82684982, "learning_rate": 3.4520268217271616e-06, "loss": 0.84809065, "num_input_tokens_seen": 86438375, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.625, "step": 4016, "time_per_iteration": 2.421196222305298 }, { "auxiliary_loss_clip": 0.01085515, "auxiliary_loss_mlp": 0.01034826, "balance_loss_clip": 1.01752841, "balance_loss_mlp": 1.02704406, "epoch": 0.24151510596723283, "flos": 40660163049600.0, "grad_norm": 1.74949835870112, "language_loss": 0.68899834, "learning_rate": 3.4517669747563305e-06, "loss": 0.71020174, "num_input_tokens_seen": 86463230, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5859375, "step": 4017, "time_per_iteration": 2.546879291534424 }, { "auxiliary_loss_clip": 0.01089601, "auxiliary_loss_mlp": 0.0103811, "balance_loss_clip": 1.01883423, "balance_loss_mlp": 1.02611876, "epoch": 0.2415752292199008, "flos": 18145792679040.0, "grad_norm": 1.6398130322302436, "language_loss": 0.8472954, "learning_rate": 3.4515070759756704e-06, "loss": 0.86857247, "num_input_tokens_seen": 86481230, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.6328125, "step": 4018, "time_per_iteration": 2.3574776649475098 }, { "auxiliary_loss_clip": 0.01016053, "auxiliary_loss_mlp": 0.01008812, "balance_loss_clip": 1.00705934, "balance_loss_mlp": 1.00344074, "epoch": 0.24163535247256876, "flos": 67285275788160.0, "grad_norm": 0.8154254800086314, "language_loss": 0.60669744, "learning_rate": 3.4512471253944563e-06, "loss": 0.62694609, "num_input_tokens_seen": 86541260, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.12597656, "step": 4019, "time_per_iteration": 3.075002908706665 }, { "auxiliary_loss_clip": 0.01085485, "auxiliary_loss_mlp": 0.01031994, "balance_loss_clip": 1.01525688, "balance_loss_mlp": 1.02541828, "epoch": 0.24169547572523672, "flos": 24928963200000.0, "grad_norm": 1.7501434214385798, "language_loss": 0.73478138, "learning_rate": 3.4509871230219653e-06, "loss": 0.75595617, "num_input_tokens_seen": 86559580, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.6015625, "step": 4020, "time_per_iteration": 2.409864664077759 }, { "auxiliary_loss_clip": 0.01088366, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.01759028, "balance_loss_mlp": 1.02827859, "epoch": 0.24175559897790472, "flos": 18727480707840.0, "grad_norm": 5.2569282255920795, "language_loss": 0.81802928, "learning_rate": 3.4507270688674767e-06, "loss": 0.83925021, "num_input_tokens_seen": 86577560, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.6015625, "step": 4021, "time_per_iteration": 2.3730390071868896 }, { "auxiliary_loss_clip": 0.0109031, "auxiliary_loss_mlp": 0.01039498, "balance_loss_clip": 1.02059174, "balance_loss_mlp": 1.02699518, "epoch": 0.24181572223057268, "flos": 23038171632000.0, "grad_norm": 1.8459056461712018, "language_loss": 0.7628122, "learning_rate": 3.45046696294027e-06, "loss": 0.78411031, "num_input_tokens_seen": 86595350, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6328125, "step": 4022, "time_per_iteration": 2.4485411643981934 }, { "auxiliary_loss_clip": 0.01089441, "auxiliary_loss_mlp": 0.01044885, "balance_loss_clip": 1.02590644, "balance_loss_mlp": 1.02579141, "epoch": 0.24187584548324065, "flos": 20375101589760.0, "grad_norm": 1.6864067988128646, "language_loss": 0.74856055, "learning_rate": 3.4502068052496283e-06, "loss": 0.76990384, "num_input_tokens_seen": 86614805, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.63671875, "step": 4023, "time_per_iteration": 2.3793880939483643 }, { "auxiliary_loss_clip": 0.01086206, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.02359855, "balance_loss_mlp": 1.02751613, "epoch": 0.2419359687359086, "flos": 21396450769920.0, "grad_norm": 1.810165434136769, "language_loss": 0.82164931, "learning_rate": 3.449946595804837e-06, "loss": 0.84292001, "num_input_tokens_seen": 86633700, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5859375, "step": 4024, "time_per_iteration": 2.3825526237487793 }, { "auxiliary_loss_clip": 0.01084842, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.01990604, "balance_loss_mlp": 1.02621841, "epoch": 0.24199609198857658, "flos": 18368398707840.0, "grad_norm": 1.6964064599763604, "language_loss": 0.86108863, "learning_rate": 3.4496863346151805e-06, "loss": 0.88231087, "num_input_tokens_seen": 86650905, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5859375, "step": 4025, "time_per_iteration": 2.3538613319396973 }, { "auxiliary_loss_clip": 0.01088574, "auxiliary_loss_mlp": 0.01045561, "balance_loss_clip": 1.02785802, "balance_loss_mlp": 1.02560258, "epoch": 0.24205621524124454, "flos": 19462856008320.0, "grad_norm": 1.9397013807255439, "language_loss": 0.71698177, "learning_rate": 3.449426021689949e-06, "loss": 0.73832315, "num_input_tokens_seen": 86669185, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.62890625, "step": 4026, "time_per_iteration": 2.3856053352355957 }, { "auxiliary_loss_clip": 0.01085522, "auxiliary_loss_mlp": 0.01035277, "balance_loss_clip": 1.01926684, "balance_loss_mlp": 1.02678347, "epoch": 0.2421163384939125, "flos": 14975434448640.0, "grad_norm": 1.8605379154607802, "language_loss": 0.64380479, "learning_rate": 3.44916565703843e-06, "loss": 0.66501278, "num_input_tokens_seen": 86686805, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5859375, "step": 4027, "time_per_iteration": 2.3626890182495117 }, { "auxiliary_loss_clip": 0.01088122, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.01759148, "balance_loss_mlp": 1.02771604, "epoch": 0.2421764617465805, "flos": 18661040657280.0, "grad_norm": 2.0249456577952, "language_loss": 0.70523262, "learning_rate": 3.4489052406699167e-06, "loss": 0.72644901, "num_input_tokens_seen": 86705520, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.6015625, "step": 4028, "time_per_iteration": 2.4186532497406006 }, { "auxiliary_loss_clip": 0.01014611, "auxiliary_loss_mlp": 0.01008285, "balance_loss_clip": 1.00647306, "balance_loss_mlp": 1.00195169, "epoch": 0.24223658499924847, "flos": 64343877454080.0, "grad_norm": 0.8748438877926056, "language_loss": 0.55332172, "learning_rate": 3.4486447725937024e-06, "loss": 0.57355064, "num_input_tokens_seen": 86767320, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.12695312, "step": 4029, "time_per_iteration": 3.024845600128174 }, { "auxiliary_loss_clip": 0.01086545, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.01610661, "balance_loss_mlp": 1.02595139, "epoch": 0.24229670825191643, "flos": 25774070503680.0, "grad_norm": 3.093962376183483, "language_loss": 0.73978829, "learning_rate": 3.448384252819083e-06, "loss": 0.7609964, "num_input_tokens_seen": 86788110, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.609375, "step": 4030, "time_per_iteration": 2.437941789627075 }, { "auxiliary_loss_clip": 0.0108547, "auxiliary_loss_mlp": 0.01039609, "balance_loss_clip": 1.02177572, "balance_loss_mlp": 1.02543759, "epoch": 0.2423568315045844, "flos": 20666067793920.0, "grad_norm": 1.906826339171084, "language_loss": 0.76540571, "learning_rate": 3.4481236813553544e-06, "loss": 0.7866565, "num_input_tokens_seen": 86807640, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6015625, "step": 4031, "time_per_iteration": 2.4073100090026855 }, { "auxiliary_loss_clip": 0.01088417, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.01839709, "balance_loss_mlp": 1.02473474, "epoch": 0.24241695475725236, "flos": 22415775091200.0, "grad_norm": 2.01946046040732, "language_loss": 0.65303868, "learning_rate": 3.447863058211817e-06, "loss": 0.67428446, "num_input_tokens_seen": 86826795, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.63671875, "step": 4032, "time_per_iteration": 2.392939329147339 }, { "auxiliary_loss_clip": 0.01084638, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.01661599, "balance_loss_mlp": 1.02461696, "epoch": 0.24247707800992033, "flos": 17128039368960.0, "grad_norm": 2.019517899431195, "language_loss": 0.81508386, "learning_rate": 3.447602383397772e-06, "loss": 0.83626723, "num_input_tokens_seen": 86843175, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.6015625, "step": 4033, "time_per_iteration": 2.3657474517822266 }, { "auxiliary_loss_clip": 0.01082597, "auxiliary_loss_mlp": 0.01030679, "balance_loss_clip": 1.01432991, "balance_loss_mlp": 1.02487993, "epoch": 0.2425372012625883, "flos": 31612386366720.0, "grad_norm": 2.0192584786228083, "language_loss": 0.69367337, "learning_rate": 3.447341656922521e-06, "loss": 0.71480614, "num_input_tokens_seen": 86863185, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.578125, "step": 4034, "time_per_iteration": 2.4704880714416504 }, { "auxiliary_loss_clip": 0.01084263, "auxiliary_loss_mlp": 0.01031035, "balance_loss_clip": 1.01366591, "balance_loss_mlp": 1.02409565, "epoch": 0.24259732451525629, "flos": 16325106854400.0, "grad_norm": 3.8398012747057058, "language_loss": 0.96217966, "learning_rate": 3.4470808787953693e-06, "loss": 0.98333263, "num_input_tokens_seen": 86880040, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.6015625, "step": 4035, "time_per_iteration": 2.3749608993530273 }, { "auxiliary_loss_clip": 0.01080649, "auxiliary_loss_mlp": 0.01035368, "balance_loss_clip": 1.01975203, "balance_loss_mlp": 1.02442312, "epoch": 0.24265744776792425, "flos": 22855540976640.0, "grad_norm": 1.5507496232106701, "language_loss": 0.77687532, "learning_rate": 3.4468200490256236e-06, "loss": 0.7980355, "num_input_tokens_seen": 86900610, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5625, "step": 4036, "time_per_iteration": 2.391878366470337 }, { "auxiliary_loss_clip": 0.01085758, "auxiliary_loss_mlp": 0.01036933, "balance_loss_clip": 1.01980233, "balance_loss_mlp": 1.02556109, "epoch": 0.24271757102059222, "flos": 21870501477120.0, "grad_norm": 1.6914231535311977, "language_loss": 0.74482965, "learning_rate": 3.4465591676225916e-06, "loss": 0.76605654, "num_input_tokens_seen": 86919385, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.6015625, "step": 4037, "time_per_iteration": 2.3816864490509033 }, { "auxiliary_loss_clip": 0.01087143, "auxiliary_loss_mlp": 0.01035193, "balance_loss_clip": 1.01744318, "balance_loss_mlp": 1.02559233, "epoch": 0.24277769427326018, "flos": 19207571080320.0, "grad_norm": 2.739470830703685, "language_loss": 0.76286781, "learning_rate": 3.446298234595584e-06, "loss": 0.78409111, "num_input_tokens_seen": 86938885, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6171875, "step": 4038, "time_per_iteration": 2.3826780319213867 }, { "auxiliary_loss_clip": 0.01087325, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.01804256, "balance_loss_mlp": 1.02585304, "epoch": 0.24283781752592815, "flos": 19498886398080.0, "grad_norm": 1.621592340513611, "language_loss": 0.71952415, "learning_rate": 3.4460372499539133e-06, "loss": 0.74076843, "num_input_tokens_seen": 86957705, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.61328125, "step": 4039, "time_per_iteration": 2.373342990875244 }, { "auxiliary_loss_clip": 0.01086069, "auxiliary_loss_mlp": 0.01043094, "balance_loss_clip": 1.02503335, "balance_loss_mlp": 1.02581549, "epoch": 0.2428979407785961, "flos": 19901155616640.0, "grad_norm": 1.665761264889598, "language_loss": 0.78207022, "learning_rate": 3.4457762137068923e-06, "loss": 0.80336183, "num_input_tokens_seen": 86975845, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6015625, "step": 4040, "time_per_iteration": 2.3768017292022705 }, { "auxiliary_loss_clip": 0.01079687, "auxiliary_loss_mlp": 0.01029417, "balance_loss_clip": 1.01310945, "balance_loss_mlp": 1.02332282, "epoch": 0.2429580640312641, "flos": 24714770808960.0, "grad_norm": 2.7083814955089975, "language_loss": 0.805282, "learning_rate": 3.4455151258638377e-06, "loss": 0.8263731, "num_input_tokens_seen": 86994800, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5625, "step": 4041, "time_per_iteration": 2.411120653152466 }, { "auxiliary_loss_clip": 0.01084251, "auxiliary_loss_mlp": 0.0103811, "balance_loss_clip": 1.02089632, "balance_loss_mlp": 1.0257473, "epoch": 0.24301818728393207, "flos": 25629145983360.0, "grad_norm": 1.9368423125732326, "language_loss": 0.7684803, "learning_rate": 3.445253986434066e-06, "loss": 0.78970385, "num_input_tokens_seen": 87016845, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 4042, "time_per_iteration": 3.937272548675537 }, { "auxiliary_loss_clip": 0.01083316, "auxiliary_loss_mlp": 0.01030303, "balance_loss_clip": 1.01465082, "balance_loss_mlp": 1.02557111, "epoch": 0.24307831053660003, "flos": 26140169687040.0, "grad_norm": 1.6918458469240896, "language_loss": 0.81556666, "learning_rate": 3.4449927954268977e-06, "loss": 0.83670294, "num_input_tokens_seen": 87036270, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.578125, "step": 4043, "time_per_iteration": 2.4226481914520264 }, { "auxiliary_loss_clip": 0.01085304, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 1.01509857, "balance_loss_mlp": 1.02388227, "epoch": 0.243138433789268, "flos": 14971629110400.0, "grad_norm": 2.114715848266188, "language_loss": 0.72919255, "learning_rate": 3.444731552851653e-06, "loss": 0.75038016, "num_input_tokens_seen": 87049920, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.61328125, "step": 4044, "time_per_iteration": 2.3478853702545166 }, { "auxiliary_loss_clip": 0.01087591, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.01803792, "balance_loss_mlp": 1.02715635, "epoch": 0.24319855704193596, "flos": 25190532172800.0, "grad_norm": 1.7308929888142233, "language_loss": 0.83334029, "learning_rate": 3.4444702587176556e-06, "loss": 0.85457766, "num_input_tokens_seen": 87068230, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6015625, "step": 4045, "time_per_iteration": 2.4102683067321777 }, { "auxiliary_loss_clip": 0.01088269, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.01668811, "balance_loss_mlp": 1.02836776, "epoch": 0.24325868029460393, "flos": 22126135518720.0, "grad_norm": 1.5821011833520624, "language_loss": 0.86685628, "learning_rate": 3.4442089130342303e-06, "loss": 0.88808119, "num_input_tokens_seen": 87086435, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.59765625, "step": 4046, "time_per_iteration": 2.3933961391448975 }, { "auxiliary_loss_clip": 0.01081384, "auxiliary_loss_mlp": 0.01030579, "balance_loss_clip": 1.01439667, "balance_loss_mlp": 1.02443242, "epoch": 0.2433188035472719, "flos": 23581106184960.0, "grad_norm": 1.7697531404225275, "language_loss": 0.72610867, "learning_rate": 3.443947515810704e-06, "loss": 0.74722838, "num_input_tokens_seen": 87105340, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5703125, "step": 4047, "time_per_iteration": 3.804840087890625 }, { "auxiliary_loss_clip": 0.01085507, "auxiliary_loss_mlp": 0.01029184, "balance_loss_clip": 1.01139784, "balance_loss_mlp": 1.02518833, "epoch": 0.2433789267999399, "flos": 24461650385280.0, "grad_norm": 2.5013067149024284, "language_loss": 0.73022771, "learning_rate": 3.4436860670564053e-06, "loss": 0.7513746, "num_input_tokens_seen": 87125780, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 4048, "time_per_iteration": 3.843885898590088 }, { "auxiliary_loss_clip": 0.01083257, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.01883984, "balance_loss_mlp": 1.02464604, "epoch": 0.24343905005260785, "flos": 16726957136640.0, "grad_norm": 1.7660263379588135, "language_loss": 0.73191977, "learning_rate": 3.443424566780664e-06, "loss": 0.75309694, "num_input_tokens_seen": 87144470, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5859375, "step": 4049, "time_per_iteration": 2.374965190887451 }, { "auxiliary_loss_clip": 0.01081639, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.01581013, "balance_loss_mlp": 1.02405167, "epoch": 0.24349917330527582, "flos": 20042833380480.0, "grad_norm": 1.6627961175457093, "language_loss": 0.73763454, "learning_rate": 3.4431630149928126e-06, "loss": 0.75877011, "num_input_tokens_seen": 87162830, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.57421875, "step": 4050, "time_per_iteration": 2.358772039413452 }, { "auxiliary_loss_clip": 0.0108224, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 1.01356351, "balance_loss_mlp": 1.02507544, "epoch": 0.24355929655794378, "flos": 17419599066240.0, "grad_norm": 2.8060132166891796, "language_loss": 0.74937081, "learning_rate": 3.442901411702186e-06, "loss": 0.77048314, "num_input_tokens_seen": 87180905, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5703125, "step": 4051, "time_per_iteration": 2.374464750289917 }, { "auxiliary_loss_clip": 0.01081998, "auxiliary_loss_mlp": 0.01033713, "balance_loss_clip": 1.01667809, "balance_loss_mlp": 1.023803, "epoch": 0.24361941981061175, "flos": 25409751799680.0, "grad_norm": 2.1546092405486124, "language_loss": 0.70442474, "learning_rate": 3.44263975691812e-06, "loss": 0.72558182, "num_input_tokens_seen": 87202290, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.578125, "step": 4052, "time_per_iteration": 3.8116774559020996 }, { "auxiliary_loss_clip": 0.01084685, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.0171299, "balance_loss_mlp": 1.02413774, "epoch": 0.2436795430632797, "flos": 22819685143680.0, "grad_norm": 1.5679740817017886, "language_loss": 0.80948436, "learning_rate": 3.4423780506499513e-06, "loss": 0.83067811, "num_input_tokens_seen": 87221650, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.60546875, "step": 4053, "time_per_iteration": 2.396677017211914 }, { "auxiliary_loss_clip": 0.01085847, "auxiliary_loss_mlp": 0.01034954, "balance_loss_clip": 1.01727521, "balance_loss_mlp": 1.02519655, "epoch": 0.2437396663159477, "flos": 15156913029120.0, "grad_norm": 1.5736054686827436, "language_loss": 0.78104687, "learning_rate": 3.44211629290702e-06, "loss": 0.80225492, "num_input_tokens_seen": 87238515, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.60546875, "step": 4054, "time_per_iteration": 2.352160930633545 }, { "auxiliary_loss_clip": 0.01083136, "auxiliary_loss_mlp": 0.01040874, "balance_loss_clip": 1.02467966, "balance_loss_mlp": 1.02459168, "epoch": 0.24379978956861567, "flos": 22090035306240.0, "grad_norm": 1.711466330754344, "language_loss": 0.8373248, "learning_rate": 3.441854483698668e-06, "loss": 0.85856485, "num_input_tokens_seen": 87256290, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5859375, "step": 4055, "time_per_iteration": 2.4138126373291016 }, { "auxiliary_loss_clip": 0.01087056, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 1.01502132, "balance_loss_mlp": 1.02472568, "epoch": 0.24385991282128364, "flos": 31466414505600.0, "grad_norm": 2.374355288857586, "language_loss": 0.54764944, "learning_rate": 3.441592623034239e-06, "loss": 0.56884593, "num_input_tokens_seen": 87277085, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.625, "step": 4056, "time_per_iteration": 2.4502336978912354 }, { "auxiliary_loss_clip": 0.01087786, "auxiliary_loss_mlp": 0.01043258, "balance_loss_clip": 1.02503085, "balance_loss_mlp": 1.02590561, "epoch": 0.2439200360739516, "flos": 23837752656000.0, "grad_norm": 2.9310997272899773, "language_loss": 0.80442935, "learning_rate": 3.4413307109230772e-06, "loss": 0.8257398, "num_input_tokens_seen": 87293020, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6171875, "step": 4057, "time_per_iteration": 2.383955240249634 }, { "auxiliary_loss_clip": 0.01083531, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.01710176, "balance_loss_mlp": 1.02569675, "epoch": 0.24398015932661957, "flos": 19169027072640.0, "grad_norm": 1.657710442182054, "language_loss": 0.7917918, "learning_rate": 3.44106874737453e-06, "loss": 0.81295723, "num_input_tokens_seen": 87311445, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.578125, "step": 4058, "time_per_iteration": 2.3650920391082764 }, { "auxiliary_loss_clip": 0.01087073, "auxiliary_loss_mlp": 0.01036755, "balance_loss_clip": 1.01982701, "balance_loss_mlp": 1.02571154, "epoch": 0.24404028257928753, "flos": 25261371054720.0, "grad_norm": 1.5991901212591748, "language_loss": 0.85366106, "learning_rate": 3.440806732397945e-06, "loss": 0.87489927, "num_input_tokens_seen": 87332055, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.61328125, "step": 4059, "time_per_iteration": 2.430112600326538 }, { "auxiliary_loss_clip": 0.01082426, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.0167253, "balance_loss_mlp": 1.02541184, "epoch": 0.2441004058319555, "flos": 26466433142400.0, "grad_norm": 1.5466199952626698, "language_loss": 0.74175167, "learning_rate": 3.4405446660026753e-06, "loss": 0.76290363, "num_input_tokens_seen": 87351295, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5703125, "step": 4060, "time_per_iteration": 2.416564464569092 }, { "auxiliary_loss_clip": 0.01089481, "auxiliary_loss_mlp": 0.01043823, "balance_loss_clip": 1.0230453, "balance_loss_mlp": 1.02646375, "epoch": 0.2441605290846235, "flos": 26759319471360.0, "grad_norm": 1.666582484146722, "language_loss": 0.73178411, "learning_rate": 3.4402825481980707e-06, "loss": 0.7531172, "num_input_tokens_seen": 87370650, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 0.6328125, "step": 4061, "time_per_iteration": 2.4373011589050293 }, { "auxiliary_loss_clip": 0.01084766, "auxiliary_loss_mlp": 0.01034174, "balance_loss_clip": 1.01852179, "balance_loss_mlp": 1.02573442, "epoch": 0.24422065233729146, "flos": 21104786338560.0, "grad_norm": 1.6867490590711338, "language_loss": 0.7636472, "learning_rate": 3.4400203789934876e-06, "loss": 0.78483665, "num_input_tokens_seen": 87389020, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.58984375, "step": 4062, "time_per_iteration": 2.389997959136963 }, { "auxiliary_loss_clip": 0.01083111, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.0172472, "balance_loss_mlp": 1.02626681, "epoch": 0.24428077558995942, "flos": 25262034370560.0, "grad_norm": 1.5326945868476514, "language_loss": 0.85370314, "learning_rate": 3.4397581583982814e-06, "loss": 0.87487519, "num_input_tokens_seen": 87409695, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.56640625, "step": 4063, "time_per_iteration": 2.4098868370056152 }, { "auxiliary_loss_clip": 0.01087889, "auxiliary_loss_mlp": 0.01032209, "balance_loss_clip": 1.01442289, "balance_loss_mlp": 1.02639985, "epoch": 0.24434089884262739, "flos": 20484240099840.0, "grad_norm": 2.3194674917998577, "language_loss": 0.6861009, "learning_rate": 3.43949588642181e-06, "loss": 0.70730186, "num_input_tokens_seen": 87428250, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.61328125, "step": 4064, "time_per_iteration": 2.369448661804199 }, { "auxiliary_loss_clip": 0.01090568, "auxiliary_loss_mlp": 0.01032053, "balance_loss_clip": 1.01345634, "balance_loss_mlp": 1.0276202, "epoch": 0.24440102209529535, "flos": 23620802267520.0, "grad_norm": 1.7129569413561863, "language_loss": 0.70268422, "learning_rate": 3.439233563073433e-06, "loss": 0.72391045, "num_input_tokens_seen": 87449380, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.62890625, "step": 4065, "time_per_iteration": 2.415884017944336 }, { "auxiliary_loss_clip": 0.0109049, "auxiliary_loss_mlp": 0.01039149, "balance_loss_clip": 1.01882362, "balance_loss_mlp": 1.0269953, "epoch": 0.24446114534796332, "flos": 20553787261440.0, "grad_norm": 1.8505396824588383, "language_loss": 0.83956051, "learning_rate": 3.4389711883625124e-06, "loss": 0.86085689, "num_input_tokens_seen": 87465365, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6328125, "step": 4066, "time_per_iteration": 2.3772501945495605 }, { "auxiliary_loss_clip": 0.01018032, "auxiliary_loss_mlp": 0.01005863, "balance_loss_clip": 1.00388443, "balance_loss_mlp": 1.00498223, "epoch": 0.24452126860063128, "flos": 60386717623680.0, "grad_norm": 0.9313977616385747, "language_loss": 0.52255923, "learning_rate": 3.4387087622984114e-06, "loss": 0.54279816, "num_input_tokens_seen": 87522525, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.13085938, "step": 4067, "time_per_iteration": 2.9381279945373535 }, { "auxiliary_loss_clip": 0.01088623, "auxiliary_loss_mlp": 0.01040489, "balance_loss_clip": 1.02146351, "balance_loss_mlp": 1.02648866, "epoch": 0.24458139185329927, "flos": 15120777905280.0, "grad_norm": 2.6971374172915916, "language_loss": 0.72052568, "learning_rate": 3.4384462848904956e-06, "loss": 0.74181682, "num_input_tokens_seen": 87539170, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.62109375, "step": 4068, "time_per_iteration": 2.349055767059326 }, { "auxiliary_loss_clip": 0.01087688, "auxiliary_loss_mlp": 0.01039753, "balance_loss_clip": 1.02128696, "balance_loss_mlp": 1.02718902, "epoch": 0.24464151510596724, "flos": 27997549217280.0, "grad_norm": 1.942589941292615, "language_loss": 0.77926159, "learning_rate": 3.438183756148132e-06, "loss": 0.80053604, "num_input_tokens_seen": 87558875, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.60546875, "step": 4069, "time_per_iteration": 2.4349539279937744 }, { "auxiliary_loss_clip": 0.01088186, "auxiliary_loss_mlp": 0.01040825, "balance_loss_clip": 1.02281272, "balance_loss_mlp": 1.02837825, "epoch": 0.2447016383586352, "flos": 19791842549760.0, "grad_norm": 1.8921631164805177, "language_loss": 0.80191195, "learning_rate": 3.4379211760806895e-06, "loss": 0.82320201, "num_input_tokens_seen": 87576485, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.59765625, "step": 4070, "time_per_iteration": 2.3638057708740234 }, { "auxiliary_loss_clip": 0.0108554, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.01531875, "balance_loss_mlp": 1.02607942, "epoch": 0.24476176161130317, "flos": 26066153871360.0, "grad_norm": 1.5368435014631852, "language_loss": 0.84227765, "learning_rate": 3.4376585446975394e-06, "loss": 0.86345899, "num_input_tokens_seen": 87598620, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.59375, "step": 4071, "time_per_iteration": 2.4472923278808594 }, { "auxiliary_loss_clip": 0.01089303, "auxiliary_loss_mlp": 0.01039843, "balance_loss_clip": 1.02109122, "balance_loss_mlp": 1.02579284, "epoch": 0.24482188486397113, "flos": 18842554149120.0, "grad_norm": 1.9747597626384026, "language_loss": 0.80001962, "learning_rate": 3.4373958620080535e-06, "loss": 0.82131112, "num_input_tokens_seen": 87616595, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6328125, "step": 4072, "time_per_iteration": 2.3589060306549072 }, { "auxiliary_loss_clip": 0.01086477, "auxiliary_loss_mlp": 0.01040998, "balance_loss_clip": 1.02470779, "balance_loss_mlp": 1.02635396, "epoch": 0.2448820081166391, "flos": 21250723288320.0, "grad_norm": 1.4779839136236708, "language_loss": 0.70185995, "learning_rate": 3.437133128021607e-06, "loss": 0.7231347, "num_input_tokens_seen": 87635755, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.6015625, "step": 4073, "time_per_iteration": 2.378868341445923 }, { "auxiliary_loss_clip": 0.01084447, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.01822329, "balance_loss_mlp": 1.02527618, "epoch": 0.2449421313693071, "flos": 23949474606720.0, "grad_norm": 1.9629998189776336, "language_loss": 0.67284667, "learning_rate": 3.436870342747576e-06, "loss": 0.69403696, "num_input_tokens_seen": 87652885, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.58984375, "step": 4074, "time_per_iteration": 2.404540538787842 }, { "auxiliary_loss_clip": 0.0101472, "auxiliary_loss_mlp": 0.0100345, "balance_loss_clip": 1.00168526, "balance_loss_mlp": 1.00193989, "epoch": 0.24500225462197506, "flos": 60684631188480.0, "grad_norm": 0.8952262686954836, "language_loss": 0.68701637, "learning_rate": 3.4366075061953383e-06, "loss": 0.70719802, "num_input_tokens_seen": 87713220, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.12695312, "step": 4075, "time_per_iteration": 3.095064878463745 }, { "auxiliary_loss_clip": 0.01086011, "auxiliary_loss_mlp": 0.01034644, "balance_loss_clip": 1.01684546, "balance_loss_mlp": 1.02662492, "epoch": 0.24506237787464302, "flos": 26283069348480.0, "grad_norm": 1.7342231317190084, "language_loss": 0.79356635, "learning_rate": 3.4363446183742745e-06, "loss": 0.81477296, "num_input_tokens_seen": 87732680, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.59375, "step": 4076, "time_per_iteration": 2.4168450832366943 }, { "auxiliary_loss_clip": 0.01090587, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.01723552, "balance_loss_mlp": 1.02680612, "epoch": 0.245122501127311, "flos": 20551413288960.0, "grad_norm": 1.7433185223663015, "language_loss": 0.81870311, "learning_rate": 3.436081679293765e-06, "loss": 0.83998084, "num_input_tokens_seen": 87751880, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.640625, "step": 4077, "time_per_iteration": 2.394547700881958 }, { "auxiliary_loss_clip": 0.01086478, "auxiliary_loss_mlp": 0.0103991, "balance_loss_clip": 1.0213728, "balance_loss_mlp": 1.02566075, "epoch": 0.24518262437997895, "flos": 29131318575360.0, "grad_norm": 1.9140340180836666, "language_loss": 0.6226418, "learning_rate": 3.435818688963195e-06, "loss": 0.64390564, "num_input_tokens_seen": 87771795, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.609375, "step": 4078, "time_per_iteration": 2.4321401119232178 }, { "auxiliary_loss_clip": 0.01085197, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.01416183, "balance_loss_mlp": 1.02681994, "epoch": 0.24524274763264692, "flos": 23475807924480.0, "grad_norm": 1.5585717701530637, "language_loss": 0.75791383, "learning_rate": 3.4355556473919496e-06, "loss": 0.77906722, "num_input_tokens_seen": 87793640, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5859375, "step": 4079, "time_per_iteration": 2.4183995723724365 }, { "auxiliary_loss_clip": 0.01084062, "auxiliary_loss_mlp": 0.01039861, "balance_loss_clip": 1.0213238, "balance_loss_mlp": 1.02516735, "epoch": 0.24530287088531488, "flos": 17200239793920.0, "grad_norm": 1.632000518799865, "language_loss": 0.74624711, "learning_rate": 3.4352925545894158e-06, "loss": 0.76748633, "num_input_tokens_seen": 87812390, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.5859375, "step": 4080, "time_per_iteration": 2.383319139480591 }, { "auxiliary_loss_clip": 0.01082398, "auxiliary_loss_mlp": 0.01029089, "balance_loss_clip": 1.0119822, "balance_loss_mlp": 1.02472401, "epoch": 0.24536299413798288, "flos": 14866540318080.0, "grad_norm": 1.7022497527737606, "language_loss": 0.82644224, "learning_rate": 3.4350294105649823e-06, "loss": 0.84755707, "num_input_tokens_seen": 87830640, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.57421875, "step": 4081, "time_per_iteration": 3.764127731323242 }, { "auxiliary_loss_clip": 0.01084602, "auxiliary_loss_mlp": 0.01039664, "balance_loss_clip": 1.02171159, "balance_loss_mlp": 1.02598345, "epoch": 0.24542311739065084, "flos": 35260600642560.0, "grad_norm": 2.0308930023785985, "language_loss": 0.73408455, "learning_rate": 3.4347662153280407e-06, "loss": 0.75532722, "num_input_tokens_seen": 87850450, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5859375, "step": 4082, "time_per_iteration": 2.5416347980499268 }, { "auxiliary_loss_clip": 0.01082989, "auxiliary_loss_mlp": 0.01039222, "balance_loss_clip": 1.02315247, "balance_loss_mlp": 1.02551281, "epoch": 0.2454832406433188, "flos": 21502167966720.0, "grad_norm": 1.7776691365028165, "language_loss": 0.71883935, "learning_rate": 3.4345029688879837e-06, "loss": 0.74006146, "num_input_tokens_seen": 87868810, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.57421875, "step": 4083, "time_per_iteration": 2.378974676132202 }, { "auxiliary_loss_clip": 0.01087814, "auxiliary_loss_mlp": 0.01037797, "balance_loss_clip": 1.01862836, "balance_loss_mlp": 1.02536213, "epoch": 0.24554336389598677, "flos": 14755795885440.0, "grad_norm": 1.9786899534505793, "language_loss": 0.74808884, "learning_rate": 3.4342396712542057e-06, "loss": 0.76934499, "num_input_tokens_seen": 87885685, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.625, "step": 4084, "time_per_iteration": 2.385164976119995 }, { "auxiliary_loss_clip": 0.01085357, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.01353431, "balance_loss_mlp": 1.02561307, "epoch": 0.24560348714865474, "flos": 14975504271360.0, "grad_norm": 2.325602264681821, "language_loss": 0.85318172, "learning_rate": 3.433976322436103e-06, "loss": 0.87434542, "num_input_tokens_seen": 87903715, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.59765625, "step": 4085, "time_per_iteration": 2.3711678981781006 }, { "auxiliary_loss_clip": 0.01085579, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.0209831, "balance_loss_mlp": 1.02607286, "epoch": 0.2456636104013227, "flos": 22674202041600.0, "grad_norm": 1.657697617162889, "language_loss": 0.79277724, "learning_rate": 3.433712922443074e-06, "loss": 0.8140254, "num_input_tokens_seen": 87923375, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.59375, "step": 4086, "time_per_iteration": 3.832991361618042 }, { "auxiliary_loss_clip": 0.01083081, "auxiliary_loss_mlp": 0.01034732, "balance_loss_clip": 1.01735139, "balance_loss_mlp": 1.02670062, "epoch": 0.2457237336539907, "flos": 27416629238400.0, "grad_norm": 1.4285693402224882, "language_loss": 0.75361806, "learning_rate": 3.433449471284519e-06, "loss": 0.77479619, "num_input_tokens_seen": 87943115, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 4087, "time_per_iteration": 2.438711643218994 }, { "auxiliary_loss_clip": 0.01089521, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.0160439, "balance_loss_mlp": 1.02893436, "epoch": 0.24578385690665866, "flos": 20411341447680.0, "grad_norm": 2.8593998768559756, "language_loss": 0.79570776, "learning_rate": 3.433185968969839e-06, "loss": 0.81694144, "num_input_tokens_seen": 87959505, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.60546875, "step": 4088, "time_per_iteration": 3.823397636413574 }, { "auxiliary_loss_clip": 0.01082546, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.00992155, "balance_loss_mlp": 1.02513552, "epoch": 0.24584398015932662, "flos": 23914247178240.0, "grad_norm": 1.441071505866467, "language_loss": 0.77050972, "learning_rate": 3.4329224155084386e-06, "loss": 0.79159987, "num_input_tokens_seen": 87979725, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.57421875, "step": 4089, "time_per_iteration": 2.4054677486419678 }, { "auxiliary_loss_clip": 0.01085736, "auxiliary_loss_mlp": 0.01040307, "balance_loss_clip": 1.02196074, "balance_loss_mlp": 1.0253849, "epoch": 0.2459041034119946, "flos": 41494866768000.0, "grad_norm": 2.22920423312802, "language_loss": 0.81344736, "learning_rate": 3.4326588109097236e-06, "loss": 0.8347078, "num_input_tokens_seen": 87998270, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6015625, "step": 4090, "time_per_iteration": 2.5362229347229004 }, { "auxiliary_loss_clip": 0.01088724, "auxiliary_loss_mlp": 0.01037483, "balance_loss_clip": 1.01815963, "balance_loss_mlp": 1.02650738, "epoch": 0.24596422666466256, "flos": 19935824463360.0, "grad_norm": 1.7131929053449881, "language_loss": 0.73610687, "learning_rate": 3.4323951551831004e-06, "loss": 0.75736898, "num_input_tokens_seen": 88016760, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.62109375, "step": 4091, "time_per_iteration": 2.367933988571167 }, { "auxiliary_loss_clip": 0.01087415, "auxiliary_loss_mlp": 0.01037644, "balance_loss_clip": 1.0196197, "balance_loss_mlp": 1.02809274, "epoch": 0.24602434991733052, "flos": 21543295415040.0, "grad_norm": 2.608448135050141, "language_loss": 0.7709353, "learning_rate": 3.432131448337979e-06, "loss": 0.7921859, "num_input_tokens_seen": 88036465, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.59375, "step": 4092, "time_per_iteration": 3.747690439224243 }, { "auxiliary_loss_clip": 0.01088136, "auxiliary_loss_mlp": 0.01034006, "balance_loss_clip": 1.01592183, "balance_loss_mlp": 1.02500153, "epoch": 0.24608447316999849, "flos": 23183968936320.0, "grad_norm": 2.343426035774861, "language_loss": 0.81301463, "learning_rate": 3.43186769038377e-06, "loss": 0.83423603, "num_input_tokens_seen": 88053270, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6328125, "step": 4093, "time_per_iteration": 2.376940965652466 }, { "auxiliary_loss_clip": 0.01090146, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.01833272, "balance_loss_mlp": 1.02576494, "epoch": 0.24614459642266648, "flos": 19641052920960.0, "grad_norm": 3.50730343237232, "language_loss": 0.8697294, "learning_rate": 3.431603881329886e-06, "loss": 0.89100766, "num_input_tokens_seen": 88072305, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.64453125, "step": 4094, "time_per_iteration": 2.37156081199646 }, { "auxiliary_loss_clip": 0.01017112, "auxiliary_loss_mlp": 0.01004962, "balance_loss_clip": 1.00329256, "balance_loss_mlp": 1.00362504, "epoch": 0.24620471967533444, "flos": 61736913699840.0, "grad_norm": 0.7448628737259854, "language_loss": 0.57499409, "learning_rate": 3.4313400211857424e-06, "loss": 0.59521484, "num_input_tokens_seen": 88137995, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.13476562, "step": 4095, "time_per_iteration": 3.10062837600708 }, { "auxiliary_loss_clip": 0.01016659, "auxiliary_loss_mlp": 0.01002304, "balance_loss_clip": 1.00047994, "balance_loss_mlp": 1.00323367, "epoch": 0.2462648429280024, "flos": 69151103867520.0, "grad_norm": 0.6407135851517143, "language_loss": 0.56290764, "learning_rate": 3.431076109960755e-06, "loss": 0.58309728, "num_input_tokens_seen": 88208490, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.13476562, "step": 4096, "time_per_iteration": 3.1554715633392334 }, { "auxiliary_loss_clip": 0.01087476, "auxiliary_loss_mlp": 0.01034048, "balance_loss_clip": 1.01619077, "balance_loss_mlp": 1.02752233, "epoch": 0.24632496618067037, "flos": 29458350080640.0, "grad_norm": 2.7101616297545945, "language_loss": 0.77540457, "learning_rate": 3.4308121476643423e-06, "loss": 0.79661977, "num_input_tokens_seen": 88228050, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6015625, "step": 4097, "time_per_iteration": 2.458148241043091 }, { "auxiliary_loss_clip": 0.01090136, "auxiliary_loss_mlp": 0.01037499, "balance_loss_clip": 1.01834202, "balance_loss_mlp": 1.02733064, "epoch": 0.24638508943333834, "flos": 24315294499200.0, "grad_norm": 1.7717785365262917, "language_loss": 0.76084214, "learning_rate": 3.4305481343059254e-06, "loss": 0.7821185, "num_input_tokens_seen": 88248090, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.62890625, "step": 4098, "time_per_iteration": 2.4008679389953613 }, { "auxiliary_loss_clip": 0.01087306, "auxiliary_loss_mlp": 0.01036631, "balance_loss_clip": 1.01951289, "balance_loss_mlp": 1.0264585, "epoch": 0.2464452126860063, "flos": 26612090801280.0, "grad_norm": 2.4152333366728453, "language_loss": 0.68078029, "learning_rate": 3.4302840698949247e-06, "loss": 0.70201969, "num_input_tokens_seen": 88267545, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.609375, "step": 4099, "time_per_iteration": 2.4404308795928955 }, { "auxiliary_loss_clip": 0.01082462, "auxiliary_loss_mlp": 0.01034877, "balance_loss_clip": 1.01868868, "balance_loss_mlp": 1.02639103, "epoch": 0.24650533593867427, "flos": 31211059754880.0, "grad_norm": 1.7694417215531217, "language_loss": 0.65903497, "learning_rate": 3.430019954440764e-06, "loss": 0.68020833, "num_input_tokens_seen": 88289785, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5625, "step": 4100, "time_per_iteration": 2.4761061668395996 }, { "auxiliary_loss_clip": 0.01016516, "auxiliary_loss_mlp": 0.01008392, "balance_loss_clip": 1.00666356, "balance_loss_mlp": 1.0029285, "epoch": 0.24656545919134226, "flos": 68490791723520.0, "grad_norm": 0.7184272720027738, "language_loss": 0.61550868, "learning_rate": 3.429755787952871e-06, "loss": 0.63575774, "num_input_tokens_seen": 88357320, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.13574219, "step": 4101, "time_per_iteration": 3.150977373123169 }, { "auxiliary_loss_clip": 0.01082988, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.01843143, "balance_loss_mlp": 1.02551293, "epoch": 0.24662558244401023, "flos": 20083157867520.0, "grad_norm": 1.6896791858925082, "language_loss": 0.72792119, "learning_rate": 3.429491570440671e-06, "loss": 0.74911392, "num_input_tokens_seen": 88377040, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.578125, "step": 4102, "time_per_iteration": 2.373002052307129 }, { "auxiliary_loss_clip": 0.01086001, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.01811719, "balance_loss_mlp": 1.02502012, "epoch": 0.2466857056966782, "flos": 30700036051200.0, "grad_norm": 2.3135662645464823, "language_loss": 0.76031542, "learning_rate": 3.4292273019135936e-06, "loss": 0.78152585, "num_input_tokens_seen": 88395085, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.609375, "step": 4103, "time_per_iteration": 2.459496021270752 }, { "auxiliary_loss_clip": 0.01087263, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.01391983, "balance_loss_mlp": 1.02669179, "epoch": 0.24674582894934616, "flos": 22527427219200.0, "grad_norm": 1.96725708445454, "language_loss": 0.78242195, "learning_rate": 3.4289629823810707e-06, "loss": 0.80361378, "num_input_tokens_seen": 88413205, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.60546875, "step": 4104, "time_per_iteration": 2.374256134033203 }, { "auxiliary_loss_clip": 0.01087911, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.01605129, "balance_loss_mlp": 1.02754247, "epoch": 0.24680595220201412, "flos": 20703250258560.0, "grad_norm": 1.705733690607604, "language_loss": 0.83070034, "learning_rate": 3.4286986118525345e-06, "loss": 0.85192949, "num_input_tokens_seen": 88431525, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.6015625, "step": 4105, "time_per_iteration": 2.4024829864501953 }, { "auxiliary_loss_clip": 0.0108874, "auxiliary_loss_mlp": 0.01038361, "balance_loss_clip": 1.02142143, "balance_loss_mlp": 1.02883244, "epoch": 0.2468660754546821, "flos": 21830211901440.0, "grad_norm": 1.8265082566322628, "language_loss": 0.76055944, "learning_rate": 3.4284341903374196e-06, "loss": 0.78183043, "num_input_tokens_seen": 88451210, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.6015625, "step": 4106, "time_per_iteration": 2.3864710330963135 }, { "auxiliary_loss_clip": 0.01084883, "auxiliary_loss_mlp": 0.01035939, "balance_loss_clip": 1.01749706, "balance_loss_mlp": 1.02491546, "epoch": 0.24692619870735008, "flos": 15266819589120.0, "grad_norm": 2.2441239554161334, "language_loss": 0.71969068, "learning_rate": 3.4281697178451638e-06, "loss": 0.74089891, "num_input_tokens_seen": 88467790, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.6015625, "step": 4107, "time_per_iteration": 2.363778829574585 }, { "auxiliary_loss_clip": 0.01088094, "auxiliary_loss_mlp": 0.01035049, "balance_loss_clip": 1.01671481, "balance_loss_mlp": 1.02747977, "epoch": 0.24698632196001805, "flos": 29678791605120.0, "grad_norm": 1.5707344941189116, "language_loss": 0.65555215, "learning_rate": 3.4279051943852037e-06, "loss": 0.67678356, "num_input_tokens_seen": 88490330, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.60546875, "step": 4108, "time_per_iteration": 2.4472367763519287 }, { "auxiliary_loss_clip": 0.01087845, "auxiliary_loss_mlp": 0.01039652, "balance_loss_clip": 1.02007771, "balance_loss_mlp": 1.02634561, "epoch": 0.247046445212686, "flos": 39163925289600.0, "grad_norm": 2.2702505861453557, "language_loss": 0.72676706, "learning_rate": 3.42764061996698e-06, "loss": 0.74804205, "num_input_tokens_seen": 88512435, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.61328125, "step": 4109, "time_per_iteration": 2.537430763244629 }, { "auxiliary_loss_clip": 0.01088715, "auxiliary_loss_mlp": 0.0103969, "balance_loss_clip": 1.02140296, "balance_loss_mlp": 1.02731705, "epoch": 0.24710656846535398, "flos": 22997847144960.0, "grad_norm": 1.760139337317379, "language_loss": 0.78744268, "learning_rate": 3.4273759945999356e-06, "loss": 0.80872673, "num_input_tokens_seen": 88529780, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.61328125, "step": 4110, "time_per_iteration": 2.3913421630859375 }, { "auxiliary_loss_clip": 0.01086692, "auxiliary_loss_mlp": 0.01041434, "balance_loss_clip": 1.02338529, "balance_loss_mlp": 1.02663243, "epoch": 0.24716669171802194, "flos": 26431589738880.0, "grad_norm": 2.6811779058260323, "language_loss": 0.80902535, "learning_rate": 3.4271113182935134e-06, "loss": 0.83030665, "num_input_tokens_seen": 88547200, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6015625, "step": 4111, "time_per_iteration": 2.443218946456909 }, { "auxiliary_loss_clip": 0.01086681, "auxiliary_loss_mlp": 0.01034782, "balance_loss_clip": 1.01852179, "balance_loss_mlp": 1.02719283, "epoch": 0.2472268149706899, "flos": 23328788722560.0, "grad_norm": 1.8822407601850477, "language_loss": 0.748734, "learning_rate": 3.4268465910571587e-06, "loss": 0.7699486, "num_input_tokens_seen": 88566415, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.59375, "step": 4112, "time_per_iteration": 2.400817632675171 }, { "auxiliary_loss_clip": 0.01086286, "auxiliary_loss_mlp": 0.01038362, "balance_loss_clip": 1.02139902, "balance_loss_mlp": 1.02570081, "epoch": 0.24728693822335787, "flos": 23767612001280.0, "grad_norm": 1.8161260878033203, "language_loss": 0.82026696, "learning_rate": 3.42658181290032e-06, "loss": 0.84151351, "num_input_tokens_seen": 88585225, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.60546875, "step": 4113, "time_per_iteration": 2.4078712463378906 }, { "auxiliary_loss_clip": 0.01083606, "auxiliary_loss_mlp": 0.01032953, "balance_loss_clip": 1.01504803, "balance_loss_mlp": 1.02510583, "epoch": 0.24734706147602586, "flos": 19316500122240.0, "grad_norm": 2.1578054812164176, "language_loss": 0.86835074, "learning_rate": 3.4263169838324458e-06, "loss": 0.88951635, "num_input_tokens_seen": 88603280, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5859375, "step": 4114, "time_per_iteration": 2.366503953933716 }, { "auxiliary_loss_clip": 0.01086202, "auxiliary_loss_mlp": 0.01034469, "balance_loss_clip": 1.01860249, "balance_loss_mlp": 1.0259521, "epoch": 0.24740718472869383, "flos": 28035709200000.0, "grad_norm": 1.5478396973111115, "language_loss": 0.75643438, "learning_rate": 3.4260521038629878e-06, "loss": 0.77764106, "num_input_tokens_seen": 88624925, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.6015625, "step": 4115, "time_per_iteration": 2.4414401054382324 }, { "auxiliary_loss_clip": 0.01016377, "auxiliary_loss_mlp": 0.01006983, "balance_loss_clip": 1.00533772, "balance_loss_mlp": 1.00268722, "epoch": 0.2474673079813618, "flos": 68103953326080.0, "grad_norm": 0.68895193068118, "language_loss": 0.58217251, "learning_rate": 3.4257871730013974e-06, "loss": 0.60240614, "num_input_tokens_seen": 88691475, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.13671875, "step": 4116, "time_per_iteration": 3.0879950523376465 }, { "auxiliary_loss_clip": 0.01084034, "auxiliary_loss_mlp": 0.01034311, "balance_loss_clip": 1.01683486, "balance_loss_mlp": 1.02543807, "epoch": 0.24752743123402976, "flos": 29460793875840.0, "grad_norm": 1.3854555118940515, "language_loss": 0.83491755, "learning_rate": 3.4255221912571315e-06, "loss": 0.85610104, "num_input_tokens_seen": 88713425, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5859375, "step": 4117, "time_per_iteration": 2.466036081314087 }, { "auxiliary_loss_clip": 0.01016117, "auxiliary_loss_mlp": 0.01006709, "balance_loss_clip": 1.00502861, "balance_loss_mlp": 1.00260723, "epoch": 0.24758755448669773, "flos": 58347545310720.0, "grad_norm": 0.9030819957565785, "language_loss": 0.63517618, "learning_rate": 3.425257158639645e-06, "loss": 0.65540445, "num_input_tokens_seen": 88769995, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.13476562, "step": 4118, "time_per_iteration": 2.903324604034424 }, { "auxiliary_loss_clip": 0.01084254, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.02037787, "balance_loss_mlp": 1.02594364, "epoch": 0.2476476777393657, "flos": 20483402227200.0, "grad_norm": 1.489226425737867, "language_loss": 0.79383802, "learning_rate": 3.424992075158397e-06, "loss": 0.81505442, "num_input_tokens_seen": 88789970, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5859375, "step": 4119, "time_per_iteration": 2.4023826122283936 }, { "auxiliary_loss_clip": 0.01085144, "auxiliary_loss_mlp": 0.01033568, "balance_loss_clip": 1.01795745, "balance_loss_mlp": 1.02686667, "epoch": 0.24770780099203366, "flos": 20484798681600.0, "grad_norm": 1.4906477756345116, "language_loss": 0.74584258, "learning_rate": 3.4247269408228467e-06, "loss": 0.76702964, "num_input_tokens_seen": 88810000, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.58203125, "step": 4120, "time_per_iteration": 2.384490728378296 }, { "auxiliary_loss_clip": 0.01087263, "auxiliary_loss_mlp": 0.01043653, "balance_loss_clip": 1.02555668, "balance_loss_mlp": 1.02644324, "epoch": 0.24776792424470165, "flos": 15152653843200.0, "grad_norm": 1.8729883823275078, "language_loss": 0.88248277, "learning_rate": 3.424461755642457e-06, "loss": 0.90379196, "num_input_tokens_seen": 88827515, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.609375, "step": 4121, "time_per_iteration": 3.769474506378174 }, { "auxiliary_loss_clip": 0.01086276, "auxiliary_loss_mlp": 0.01035794, "balance_loss_clip": 1.01711369, "balance_loss_mlp": 1.02580178, "epoch": 0.2478280474973696, "flos": 21724389970560.0, "grad_norm": 4.090215250538317, "language_loss": 0.69454342, "learning_rate": 3.4241965196266912e-06, "loss": 0.71576416, "num_input_tokens_seen": 88845025, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.60546875, "step": 4122, "time_per_iteration": 2.377908229827881 }, { "auxiliary_loss_clip": 0.01085065, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.02011788, "balance_loss_mlp": 1.02521086, "epoch": 0.24788817075003758, "flos": 20411166890880.0, "grad_norm": 2.0516835943803082, "language_loss": 0.8037625, "learning_rate": 3.4239312327850155e-06, "loss": 0.82498991, "num_input_tokens_seen": 88861740, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59765625, "step": 4123, "time_per_iteration": 2.3808038234710693 }, { "auxiliary_loss_clip": 0.01084513, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.02137208, "balance_loss_mlp": 1.02572, "epoch": 0.24794829400270554, "flos": 22593553067520.0, "grad_norm": 1.7197151615863469, "language_loss": 0.74973655, "learning_rate": 3.423665895126897e-06, "loss": 0.77095872, "num_input_tokens_seen": 88879740, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5859375, "step": 4124, "time_per_iteration": 2.4803411960601807 }, { "auxiliary_loss_clip": 0.01084843, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.01540172, "balance_loss_mlp": 1.02780402, "epoch": 0.2480084172553735, "flos": 39674495145600.0, "grad_norm": 1.4040776014202108, "language_loss": 0.73531151, "learning_rate": 3.4234005066618047e-06, "loss": 0.75646973, "num_input_tokens_seen": 88904095, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5703125, "step": 4125, "time_per_iteration": 3.9552083015441895 }, { "auxiliary_loss_clip": 0.01088419, "auxiliary_loss_mlp": 0.01038379, "balance_loss_clip": 1.02046132, "balance_loss_mlp": 1.02543163, "epoch": 0.24806854050804147, "flos": 22052643373440.0, "grad_norm": 1.9689184165616531, "language_loss": 0.69233143, "learning_rate": 3.4231350673992093e-06, "loss": 0.71359944, "num_input_tokens_seen": 88920740, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.62890625, "step": 4126, "time_per_iteration": 2.3816680908203125 }, { "auxiliary_loss_clip": 0.01086558, "auxiliary_loss_mlp": 0.01040118, "balance_loss_clip": 1.02257013, "balance_loss_mlp": 1.02685738, "epoch": 0.24812866376070947, "flos": 15485864659200.0, "grad_norm": 2.0665858456834374, "language_loss": 0.80779505, "learning_rate": 3.422869577348584e-06, "loss": 0.82906175, "num_input_tokens_seen": 88938510, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.59765625, "step": 4127, "time_per_iteration": 3.7433838844299316 }, { "auxiliary_loss_clip": 0.01088358, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.01672888, "balance_loss_mlp": 1.02749491, "epoch": 0.24818878701337743, "flos": 14756529024000.0, "grad_norm": 3.4168117397882307, "language_loss": 0.844118, "learning_rate": 3.422604036519404e-06, "loss": 0.865336, "num_input_tokens_seen": 88955235, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.609375, "step": 4128, "time_per_iteration": 2.366837501525879 }, { "auxiliary_loss_clip": 0.01085909, "auxiliary_loss_mlp": 0.01035099, "balance_loss_clip": 1.0185169, "balance_loss_mlp": 1.02622509, "epoch": 0.2482489102660454, "flos": 27088271101440.0, "grad_norm": 2.6987227673288805, "language_loss": 0.6540767, "learning_rate": 3.4223384449211457e-06, "loss": 0.67528689, "num_input_tokens_seen": 88975210, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59765625, "step": 4129, "time_per_iteration": 2.4251272678375244 }, { "auxiliary_loss_clip": 0.01085872, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.01737642, "balance_loss_mlp": 1.0268786, "epoch": 0.24830903351871336, "flos": 26466363319680.0, "grad_norm": 2.252155087549559, "language_loss": 0.75110793, "learning_rate": 3.4220728025632863e-06, "loss": 0.77231193, "num_input_tokens_seen": 88996120, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58984375, "step": 4130, "time_per_iteration": 2.4295639991760254 }, { "auxiliary_loss_clip": 0.01086163, "auxiliary_loss_mlp": 0.01034899, "balance_loss_clip": 1.01736295, "balance_loss_mlp": 1.02593648, "epoch": 0.24836915677138133, "flos": 10227805459200.0, "grad_norm": 2.069456691917796, "language_loss": 0.76421893, "learning_rate": 3.421807109455307e-06, "loss": 0.78542954, "num_input_tokens_seen": 89008685, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6015625, "step": 4131, "time_per_iteration": 3.7015538215637207 }, { "auxiliary_loss_clip": 0.01083747, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.01816547, "balance_loss_mlp": 1.02656412, "epoch": 0.2484292800240493, "flos": 30079140698880.0, "grad_norm": 1.6020381465042943, "language_loss": 0.84020936, "learning_rate": 3.4215413656066893e-06, "loss": 0.86138105, "num_input_tokens_seen": 89031160, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5703125, "step": 4132, "time_per_iteration": 2.455026865005493 }, { "auxiliary_loss_clip": 0.01087027, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.01451731, "balance_loss_mlp": 1.02686334, "epoch": 0.24848940327671726, "flos": 13442118958080.0, "grad_norm": 1.695454739942384, "language_loss": 0.7100206, "learning_rate": 3.4212755710269163e-06, "loss": 0.7312181, "num_input_tokens_seen": 89047235, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6015625, "step": 4133, "time_per_iteration": 2.4285826683044434 }, { "auxiliary_loss_clip": 0.01090741, "auxiliary_loss_mlp": 0.01038368, "balance_loss_clip": 1.01745844, "balance_loss_mlp": 1.02635288, "epoch": 0.24854952652938525, "flos": 19969341235200.0, "grad_norm": 2.252359131562968, "language_loss": 0.60830545, "learning_rate": 3.4210097257254748e-06, "loss": 0.62959659, "num_input_tokens_seen": 89064790, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 0.64453125, "step": 4134, "time_per_iteration": 2.362736463546753 }, { "auxiliary_loss_clip": 0.01085969, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.01839435, "balance_loss_mlp": 1.02588129, "epoch": 0.24860964978205322, "flos": 18149213992320.0, "grad_norm": 1.9686896426832157, "language_loss": 0.7874074, "learning_rate": 3.420743829711851e-06, "loss": 0.80863166, "num_input_tokens_seen": 89083250, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6015625, "step": 4135, "time_per_iteration": 2.378121852874756 }, { "auxiliary_loss_clip": 0.01088973, "auxiliary_loss_mlp": 0.01031641, "balance_loss_clip": 1.01552403, "balance_loss_mlp": 1.02883184, "epoch": 0.24866977303472118, "flos": 11727848557440.0, "grad_norm": 8.824193874715622, "language_loss": 0.83314431, "learning_rate": 3.420477882995535e-06, "loss": 0.85435045, "num_input_tokens_seen": 89100905, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.6015625, "step": 4136, "time_per_iteration": 2.3484787940979004 }, { "auxiliary_loss_clip": 0.01086764, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.02116978, "balance_loss_mlp": 1.02579129, "epoch": 0.24872989628738915, "flos": 34822161388800.0, "grad_norm": 1.8902196463562428, "language_loss": 0.70785689, "learning_rate": 3.420211885586017e-06, "loss": 0.72910678, "num_input_tokens_seen": 89122630, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.609375, "step": 4137, "time_per_iteration": 2.5101962089538574 }, { "auxiliary_loss_clip": 0.01086507, "auxiliary_loss_mlp": 0.01037768, "balance_loss_clip": 1.0215317, "balance_loss_mlp": 1.02494049, "epoch": 0.2487900195400571, "flos": 13698486138240.0, "grad_norm": 1.8895909219766391, "language_loss": 0.66491926, "learning_rate": 3.41994583749279e-06, "loss": 0.68616199, "num_input_tokens_seen": 89141050, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.61328125, "step": 4138, "time_per_iteration": 2.365785598754883 }, { "auxiliary_loss_clip": 0.01083291, "auxiliary_loss_mlp": 0.01031741, "balance_loss_clip": 1.0165236, "balance_loss_mlp": 1.02615142, "epoch": 0.24885014279272508, "flos": 25336643679360.0, "grad_norm": 1.858364232849583, "language_loss": 0.83803201, "learning_rate": 3.4196797387253482e-06, "loss": 0.85918236, "num_input_tokens_seen": 89160810, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5703125, "step": 4139, "time_per_iteration": 2.4217495918273926 }, { "auxiliary_loss_clip": 0.01086816, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.01742208, "balance_loss_mlp": 1.02614295, "epoch": 0.24891026604539307, "flos": 20630386517760.0, "grad_norm": 1.484568005547756, "language_loss": 0.78808331, "learning_rate": 3.419413589293189e-06, "loss": 0.80931115, "num_input_tokens_seen": 89180610, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.60546875, "step": 4140, "time_per_iteration": 2.404123067855835 }, { "auxiliary_loss_clip": 0.01015239, "auxiliary_loss_mlp": 0.01004717, "balance_loss_clip": 1.00309598, "balance_loss_mlp": 1.00217843, "epoch": 0.24897038929806103, "flos": 66957162030720.0, "grad_norm": 0.83112928286704, "language_loss": 0.61025429, "learning_rate": 3.4191473892058094e-06, "loss": 0.63045382, "num_input_tokens_seen": 89241880, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.13085938, "step": 4141, "time_per_iteration": 3.082777261734009 }, { "auxiliary_loss_clip": 0.01089157, "auxiliary_loss_mlp": 0.01040758, "balance_loss_clip": 1.02350843, "balance_loss_mlp": 1.02753234, "epoch": 0.249030512550729, "flos": 36391088332800.0, "grad_norm": 1.8390721993300967, "language_loss": 0.72601914, "learning_rate": 3.4188811384727104e-06, "loss": 0.74731827, "num_input_tokens_seen": 89263340, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.6171875, "step": 4142, "time_per_iteration": 2.5076329708099365 }, { "auxiliary_loss_clip": 0.01088461, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.01664865, "balance_loss_mlp": 1.02794802, "epoch": 0.24909063580339696, "flos": 20153612724480.0, "grad_norm": 1.6955684921265535, "language_loss": 0.80873203, "learning_rate": 3.418614837103393e-06, "loss": 0.82994366, "num_input_tokens_seen": 89282870, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.60546875, "step": 4143, "time_per_iteration": 2.396929979324341 }, { "auxiliary_loss_clip": 0.01080377, "auxiliary_loss_mlp": 0.01029764, "balance_loss_clip": 1.01424301, "balance_loss_mlp": 1.02441239, "epoch": 0.24915075905606493, "flos": 26395349880960.0, "grad_norm": 1.850173296193879, "language_loss": 0.58839977, "learning_rate": 3.418348485107362e-06, "loss": 0.60950112, "num_input_tokens_seen": 89303830, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.55859375, "step": 4144, "time_per_iteration": 2.4307267665863037 }, { "auxiliary_loss_clip": 0.01082384, "auxiliary_loss_mlp": 0.01035851, "balance_loss_clip": 1.01903057, "balance_loss_mlp": 1.02435446, "epoch": 0.2492108823087329, "flos": 27525977216640.0, "grad_norm": 8.220650966989995, "language_loss": 0.78757977, "learning_rate": 3.4180820824941213e-06, "loss": 0.80876213, "num_input_tokens_seen": 89324350, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.578125, "step": 4145, "time_per_iteration": 2.43618106842041 }, { "auxiliary_loss_clip": 0.01094052, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.01956058, "balance_loss_mlp": 1.02610385, "epoch": 0.24927100556140086, "flos": 16690437987840.0, "grad_norm": 1.9456226615546552, "language_loss": 0.65626216, "learning_rate": 3.4178156292731787e-06, "loss": 0.67760944, "num_input_tokens_seen": 89342875, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 0.6796875, "step": 4146, "time_per_iteration": 2.3641622066497803 }, { "auxiliary_loss_clip": 0.01015156, "auxiliary_loss_mlp": 0.01001284, "balance_loss_clip": 0.99965078, "balance_loss_mlp": 1.00193524, "epoch": 0.24933112881406885, "flos": 62769225047040.0, "grad_norm": 0.9470214086913717, "language_loss": 0.6726746, "learning_rate": 3.4175491254540436e-06, "loss": 0.69283903, "num_input_tokens_seen": 89404925, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.13183594, "step": 4147, "time_per_iteration": 3.1399781703948975 }, { "auxiliary_loss_clip": 0.01088461, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.02336502, "balance_loss_mlp": 1.02815604, "epoch": 0.24939125206673682, "flos": 26650669720320.0, "grad_norm": 1.7022715726379114, "language_loss": 0.89207381, "learning_rate": 3.4172825710462267e-06, "loss": 0.91336215, "num_input_tokens_seen": 89425090, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.6015625, "step": 4148, "time_per_iteration": 2.417715072631836 }, { "auxiliary_loss_clip": 0.01090791, "auxiliary_loss_mlp": 0.01041572, "balance_loss_clip": 1.02139008, "balance_loss_mlp": 1.02708268, "epoch": 0.24945137531940478, "flos": 20703285169920.0, "grad_norm": 1.952899303527093, "language_loss": 0.68199652, "learning_rate": 3.4170159660592404e-06, "loss": 0.70332015, "num_input_tokens_seen": 89442615, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.63671875, "step": 4149, "time_per_iteration": 2.385103702545166 }, { "auxiliary_loss_clip": 0.01084708, "auxiliary_loss_mlp": 0.01031042, "balance_loss_clip": 1.01358342, "balance_loss_mlp": 1.02614653, "epoch": 0.24951149857207275, "flos": 23767542178560.0, "grad_norm": 1.691960060504696, "language_loss": 0.7100659, "learning_rate": 3.416749310502599e-06, "loss": 0.73122334, "num_input_tokens_seen": 89463025, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5859375, "step": 4150, "time_per_iteration": 2.413113594055176 }, { "auxiliary_loss_clip": 0.01087064, "auxiliary_loss_mlp": 0.01037824, "balance_loss_clip": 1.01902509, "balance_loss_mlp": 1.02616155, "epoch": 0.2495716218247407, "flos": 15664096483200.0, "grad_norm": 1.7552566589860064, "language_loss": 0.72874904, "learning_rate": 3.4164826043858195e-06, "loss": 0.74999797, "num_input_tokens_seen": 89480225, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.609375, "step": 4151, "time_per_iteration": 2.353670358657837 }, { "auxiliary_loss_clip": 0.01092564, "auxiliary_loss_mlp": 0.01044131, "balance_loss_clip": 1.02511668, "balance_loss_mlp": 1.02713895, "epoch": 0.24963174507740868, "flos": 24051595933440.0, "grad_norm": 2.5621696544024126, "language_loss": 0.63709629, "learning_rate": 3.416215847718419e-06, "loss": 0.65846318, "num_input_tokens_seen": 89496985, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.65234375, "step": 4152, "time_per_iteration": 2.3920669555664062 }, { "auxiliary_loss_clip": 0.01085249, "auxiliary_loss_mlp": 0.01038473, "balance_loss_clip": 1.02147388, "balance_loss_mlp": 1.02740765, "epoch": 0.24969186833007664, "flos": 21798405786240.0, "grad_norm": 2.2758752004914005, "language_loss": 0.77126729, "learning_rate": 3.4159490405099183e-06, "loss": 0.79250455, "num_input_tokens_seen": 89514420, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.578125, "step": 4153, "time_per_iteration": 2.3825888633728027 }, { "auxiliary_loss_clip": 0.01084138, "auxiliary_loss_mlp": 0.01033125, "balance_loss_clip": 1.01632833, "balance_loss_mlp": 1.02591348, "epoch": 0.24975199158274464, "flos": 19937116183680.0, "grad_norm": 1.7964037245080868, "language_loss": 0.76339287, "learning_rate": 3.4156821827698387e-06, "loss": 0.78456545, "num_input_tokens_seen": 89532925, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5859375, "step": 4154, "time_per_iteration": 2.3665268421173096 }, { "auxiliary_loss_clip": 0.01090074, "auxiliary_loss_mlp": 0.01036943, "balance_loss_clip": 1.01710677, "balance_loss_mlp": 1.02616119, "epoch": 0.2498121148354126, "flos": 25337202261120.0, "grad_norm": 2.1806513294402747, "language_loss": 0.70880032, "learning_rate": 3.4154152745077027e-06, "loss": 0.73007047, "num_input_tokens_seen": 89552855, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.640625, "step": 4155, "time_per_iteration": 2.402557134628296 }, { "auxiliary_loss_clip": 0.01088748, "auxiliary_loss_mlp": 0.01040095, "balance_loss_clip": 1.02145052, "balance_loss_mlp": 1.02694702, "epoch": 0.24987223808808057, "flos": 20557732245120.0, "grad_norm": 1.5898528415838775, "language_loss": 0.75153297, "learning_rate": 3.4151483157330373e-06, "loss": 0.77282143, "num_input_tokens_seen": 89572830, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.6171875, "step": 4156, "time_per_iteration": 2.3852381706237793 }, { "auxiliary_loss_clip": 0.01085636, "auxiliary_loss_mlp": 0.01030915, "balance_loss_clip": 1.0144881, "balance_loss_mlp": 1.02557111, "epoch": 0.24993236134074853, "flos": 19748201483520.0, "grad_norm": 3.2139917966064724, "language_loss": 0.76728547, "learning_rate": 3.4148813064553686e-06, "loss": 0.78845096, "num_input_tokens_seen": 89590345, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.6015625, "step": 4157, "time_per_iteration": 2.372711181640625 }, { "auxiliary_loss_clip": 0.01086456, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.01759982, "balance_loss_mlp": 1.02622128, "epoch": 0.2499924845934165, "flos": 18769306383360.0, "grad_norm": 1.5277581234599116, "language_loss": 0.81389397, "learning_rate": 3.4146142466842253e-06, "loss": 0.83511102, "num_input_tokens_seen": 89610295, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6015625, "step": 4158, "time_per_iteration": 2.3913378715515137 }, { "auxiliary_loss_clip": 0.01087098, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.01334131, "balance_loss_mlp": 1.0268569, "epoch": 0.25005260784608446, "flos": 16871288163840.0, "grad_norm": 1.8097729144184502, "language_loss": 0.76135957, "learning_rate": 3.414347136429138e-06, "loss": 0.78252989, "num_input_tokens_seen": 89627795, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.6015625, "step": 4159, "time_per_iteration": 2.3434834480285645 }, { "auxiliary_loss_clip": 0.01089061, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 1.01550508, "balance_loss_mlp": 1.02670324, "epoch": 0.2501127310987524, "flos": 22123901191680.0, "grad_norm": 1.867266487831828, "language_loss": 0.7131983, "learning_rate": 3.4140799756996403e-06, "loss": 0.73442852, "num_input_tokens_seen": 89648090, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.625, "step": 4160, "time_per_iteration": 2.400459051132202 }, { "auxiliary_loss_clip": 0.01015589, "auxiliary_loss_mlp": 0.01007315, "balance_loss_clip": 1.00575364, "balance_loss_mlp": 1.00244224, "epoch": 0.2501728543514204, "flos": 69454393781760.0, "grad_norm": 0.7484803095786234, "language_loss": 0.56746018, "learning_rate": 3.4138127645052653e-06, "loss": 0.58768922, "num_input_tokens_seen": 89710345, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.13085938, "step": 4161, "time_per_iteration": 4.48866081237793 }, { "auxiliary_loss_clip": 0.01093328, "auxiliary_loss_mlp": 0.01042829, "balance_loss_clip": 1.02357638, "balance_loss_mlp": 1.02814245, "epoch": 0.25023297760408836, "flos": 16289041553280.0, "grad_norm": 1.6335203249332388, "language_loss": 0.80808181, "learning_rate": 3.41354550285555e-06, "loss": 0.8294434, "num_input_tokens_seen": 89729390, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.65234375, "step": 4162, "time_per_iteration": 2.3679251670837402 }, { "auxiliary_loss_clip": 0.01087607, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.01810074, "balance_loss_mlp": 1.02499318, "epoch": 0.2502931008567563, "flos": 12237231427200.0, "grad_norm": 2.073921826240484, "language_loss": 0.87346721, "learning_rate": 3.413278190760031e-06, "loss": 0.89470905, "num_input_tokens_seen": 89742805, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.625, "step": 4163, "time_per_iteration": 2.352168083190918 }, { "auxiliary_loss_clip": 0.01087454, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.01659131, "balance_loss_mlp": 1.0266645, "epoch": 0.25035322410942434, "flos": 23180861825280.0, "grad_norm": 1.5284149766660347, "language_loss": 0.83000046, "learning_rate": 3.413010828228249e-06, "loss": 0.85121864, "num_input_tokens_seen": 89761145, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.60546875, "step": 4164, "time_per_iteration": 2.3776345252990723 }, { "auxiliary_loss_clip": 0.01086314, "auxiliary_loss_mlp": 0.01038361, "balance_loss_clip": 1.02285218, "balance_loss_mlp": 1.02929795, "epoch": 0.2504133473620923, "flos": 20916639688320.0, "grad_norm": 1.6790768031802228, "language_loss": 0.7416389, "learning_rate": 3.4127434152697453e-06, "loss": 0.76288569, "num_input_tokens_seen": 89780905, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5703125, "step": 4165, "time_per_iteration": 3.7848379611968994 }, { "auxiliary_loss_clip": 0.01090005, "auxiliary_loss_mlp": 0.0103736, "balance_loss_clip": 1.01851296, "balance_loss_mlp": 1.02677178, "epoch": 0.2504734706147603, "flos": 20775520506240.0, "grad_norm": 1.6360191852191746, "language_loss": 0.73811507, "learning_rate": 3.4124759518940637e-06, "loss": 0.75938869, "num_input_tokens_seen": 89799230, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6328125, "step": 4166, "time_per_iteration": 2.3740110397338867 }, { "auxiliary_loss_clip": 0.01082551, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.02244079, "balance_loss_mlp": 1.02522874, "epoch": 0.25053359386742824, "flos": 24348322512000.0, "grad_norm": 1.6573165904586133, "language_loss": 0.8177495, "learning_rate": 3.412208438110748e-06, "loss": 0.83897543, "num_input_tokens_seen": 89818240, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.57421875, "step": 4167, "time_per_iteration": 3.774843215942383 }, { "auxiliary_loss_clip": 0.01085933, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.01667368, "balance_loss_mlp": 1.02652943, "epoch": 0.2505937171200962, "flos": 21213296444160.0, "grad_norm": 2.054142096798254, "language_loss": 0.79331625, "learning_rate": 3.411940873929346e-06, "loss": 0.81451225, "num_input_tokens_seen": 89834485, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59375, "step": 4168, "time_per_iteration": 2.3814868927001953 }, { "auxiliary_loss_clip": 0.01089124, "auxiliary_loss_mlp": 0.01037903, "balance_loss_clip": 1.01776874, "balance_loss_mlp": 1.02649546, "epoch": 0.25065384037276417, "flos": 41425633808640.0, "grad_norm": 1.949754426550837, "language_loss": 0.69879848, "learning_rate": 3.411673259359406e-06, "loss": 0.72006875, "num_input_tokens_seen": 89855645, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.625, "step": 4169, "time_per_iteration": 2.551462173461914 }, { "auxiliary_loss_clip": 0.01082874, "auxiliary_loss_mlp": 0.01036984, "balance_loss_clip": 1.02077127, "balance_loss_mlp": 1.02544272, "epoch": 0.25071396362543213, "flos": 26101241654400.0, "grad_norm": 1.689448057975862, "language_loss": 0.7732088, "learning_rate": 3.411405594410479e-06, "loss": 0.79440737, "num_input_tokens_seen": 89874895, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.57421875, "step": 4170, "time_per_iteration": 2.441685676574707 }, { "auxiliary_loss_clip": 0.01085652, "auxiliary_loss_mlp": 0.01034209, "balance_loss_clip": 1.01724577, "balance_loss_mlp": 1.02592623, "epoch": 0.2507740868781001, "flos": 19097978722560.0, "grad_norm": 2.3655046764559784, "language_loss": 0.76594567, "learning_rate": 3.4111378790921162e-06, "loss": 0.78714424, "num_input_tokens_seen": 89891700, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59765625, "step": 4171, "time_per_iteration": 3.7297091484069824 }, { "auxiliary_loss_clip": 0.01019602, "auxiliary_loss_mlp": 0.01005348, "balance_loss_clip": 1.0035243, "balance_loss_mlp": 1.00611258, "epoch": 0.25083421013076806, "flos": 64338570927360.0, "grad_norm": 0.8355812473618281, "language_loss": 0.60083926, "learning_rate": 3.4108701134138727e-06, "loss": 0.62108874, "num_input_tokens_seen": 89955775, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.13476562, "step": 4172, "time_per_iteration": 3.0390610694885254 }, { "auxiliary_loss_clip": 0.01086664, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.0154984, "balance_loss_mlp": 1.02559733, "epoch": 0.25089433338343603, "flos": 24278461148160.0, "grad_norm": 1.419423819463803, "language_loss": 0.78949177, "learning_rate": 3.4106022973853045e-06, "loss": 0.8106997, "num_input_tokens_seen": 89977150, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.609375, "step": 4173, "time_per_iteration": 2.4250080585479736 }, { "auxiliary_loss_clip": 0.01086158, "auxiliary_loss_mlp": 0.01038105, "balance_loss_clip": 1.01974726, "balance_loss_mlp": 1.0268693, "epoch": 0.250954456636104, "flos": 14720568456960.0, "grad_norm": 1.7970204160651273, "language_loss": 0.83641088, "learning_rate": 3.4103344310159685e-06, "loss": 0.8576535, "num_input_tokens_seen": 89994925, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.59375, "step": 4174, "time_per_iteration": 2.3676953315734863 }, { "auxiliary_loss_clip": 0.01089318, "auxiliary_loss_mlp": 0.01039375, "balance_loss_clip": 1.02039671, "balance_loss_mlp": 1.02797008, "epoch": 0.25101457988877196, "flos": 22272491404800.0, "grad_norm": 2.0000421087695583, "language_loss": 0.71564239, "learning_rate": 3.4100665143154245e-06, "loss": 0.7369293, "num_input_tokens_seen": 90013235, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.61328125, "step": 4175, "time_per_iteration": 2.391385555267334 }, { "auxiliary_loss_clip": 0.01085807, "auxiliary_loss_mlp": 0.01031672, "balance_loss_clip": 1.01374292, "balance_loss_mlp": 1.02490354, "epoch": 0.2510747031414399, "flos": 25187843998080.0, "grad_norm": 2.066901444341313, "language_loss": 0.80781257, "learning_rate": 3.409798547293234e-06, "loss": 0.82898736, "num_input_tokens_seen": 90032150, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.609375, "step": 4176, "time_per_iteration": 2.411552906036377 }, { "auxiliary_loss_clip": 0.01090539, "auxiliary_loss_mlp": 0.01033101, "balance_loss_clip": 1.01437354, "balance_loss_mlp": 1.02882624, "epoch": 0.25113482639410795, "flos": 20703145524480.0, "grad_norm": 1.8239141578092002, "language_loss": 0.8288976, "learning_rate": 3.4095305299589593e-06, "loss": 0.85013407, "num_input_tokens_seen": 90049085, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6171875, "step": 4177, "time_per_iteration": 2.3847196102142334 }, { "auxiliary_loss_clip": 0.01090298, "auxiliary_loss_mlp": 0.01039912, "balance_loss_clip": 1.02176857, "balance_loss_mlp": 1.02995527, "epoch": 0.2511949496467759, "flos": 21505868570880.0, "grad_norm": 2.534211058518571, "language_loss": 0.82993323, "learning_rate": 3.409262462322166e-06, "loss": 0.85123539, "num_input_tokens_seen": 90067695, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6015625, "step": 4178, "time_per_iteration": 2.3937385082244873 }, { "auxiliary_loss_clip": 0.01085071, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.02128625, "balance_loss_mlp": 1.0263536, "epoch": 0.2512550728994439, "flos": 20701015931520.0, "grad_norm": 2.069309614923113, "language_loss": 0.75970161, "learning_rate": 3.40899434439242e-06, "loss": 0.78093529, "num_input_tokens_seen": 90083890, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5859375, "step": 4179, "time_per_iteration": 2.383949041366577 }, { "auxiliary_loss_clip": 0.01088564, "auxiliary_loss_mlp": 0.01040843, "balance_loss_clip": 1.02212667, "balance_loss_mlp": 1.02778363, "epoch": 0.25131519615211184, "flos": 18477641952000.0, "grad_norm": 1.8852729951139797, "language_loss": 0.70328605, "learning_rate": 3.4087261761792908e-06, "loss": 0.72458005, "num_input_tokens_seen": 90100995, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.609375, "step": 4180, "time_per_iteration": 2.3751630783081055 }, { "auxiliary_loss_clip": 0.01088551, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.02038801, "balance_loss_mlp": 1.02811086, "epoch": 0.2513753194047798, "flos": 20483925897600.0, "grad_norm": 2.393589595119289, "language_loss": 0.86134291, "learning_rate": 3.4084579576923477e-06, "loss": 0.88261461, "num_input_tokens_seen": 90120365, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.60546875, "step": 4181, "time_per_iteration": 2.387640953063965 }, { "auxiliary_loss_clip": 0.01086036, "auxiliary_loss_mlp": 0.01035151, "balance_loss_clip": 1.01811576, "balance_loss_mlp": 1.02726042, "epoch": 0.25143544265744777, "flos": 37668560313600.0, "grad_norm": 1.9317609465821208, "language_loss": 0.68366534, "learning_rate": 3.4081896889411634e-06, "loss": 0.7048772, "num_input_tokens_seen": 90142610, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5859375, "step": 4182, "time_per_iteration": 2.5509138107299805 }, { "auxiliary_loss_clip": 0.01018334, "auxiliary_loss_mlp": 0.01004811, "balance_loss_clip": 1.00305831, "balance_loss_mlp": 1.00486362, "epoch": 0.25149556591011574, "flos": 69364283829120.0, "grad_norm": 0.8462462626347791, "language_loss": 0.700665, "learning_rate": 3.407921369935311e-06, "loss": 0.72089636, "num_input_tokens_seen": 90200555, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.13476562, "step": 4183, "time_per_iteration": 3.0337069034576416 }, { "auxiliary_loss_clip": 0.01085617, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.02190304, "balance_loss_mlp": 1.02508175, "epoch": 0.2515556891627837, "flos": 13989557076480.0, "grad_norm": 1.8504628554017286, "language_loss": 0.74262583, "learning_rate": 3.407653000684367e-06, "loss": 0.7638849, "num_input_tokens_seen": 90218120, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.60546875, "step": 4184, "time_per_iteration": 2.3725948333740234 }, { "auxiliary_loss_clip": 0.01089891, "auxiliary_loss_mlp": 0.01037587, "balance_loss_clip": 1.01989603, "balance_loss_mlp": 1.03025913, "epoch": 0.25161581241545167, "flos": 22162445199360.0, "grad_norm": 1.7187934828601634, "language_loss": 0.83149958, "learning_rate": 3.407384581197908e-06, "loss": 0.85277438, "num_input_tokens_seen": 90236790, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.59375, "step": 4185, "time_per_iteration": 2.37890362739563 }, { "auxiliary_loss_clip": 0.01016642, "auxiliary_loss_mlp": 0.01001929, "balance_loss_clip": 1.00006974, "balance_loss_mlp": 1.00283837, "epoch": 0.25167593566811963, "flos": 69355486166400.0, "grad_norm": 0.7935930278087194, "language_loss": 0.61524451, "learning_rate": 3.4071161114855134e-06, "loss": 0.63543022, "num_input_tokens_seen": 90297070, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.13867188, "step": 4186, "time_per_iteration": 2.9305331707000732 }, { "auxiliary_loss_clip": 0.01085627, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.01971817, "balance_loss_mlp": 1.02617478, "epoch": 0.2517360589207876, "flos": 13260605466240.0, "grad_norm": 1.849702776315767, "language_loss": 0.78915787, "learning_rate": 3.406847591556764e-06, "loss": 0.81038737, "num_input_tokens_seen": 90315255, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 4187, "time_per_iteration": 2.353400468826294 }, { "auxiliary_loss_clip": 0.0108619, "auxiliary_loss_mlp": 0.01044634, "balance_loss_clip": 1.02737236, "balance_loss_mlp": 1.02780402, "epoch": 0.25179618217345556, "flos": 20375764905600.0, "grad_norm": 1.4541123230854403, "language_loss": 0.79604644, "learning_rate": 3.406579021421244e-06, "loss": 0.81735468, "num_input_tokens_seen": 90334990, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5859375, "step": 4188, "time_per_iteration": 2.3837485313415527 }, { "auxiliary_loss_clip": 0.01083816, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.02266884, "balance_loss_mlp": 1.02533996, "epoch": 0.25185630542612353, "flos": 27663709996800.0, "grad_norm": 1.8785720766922807, "language_loss": 0.74433601, "learning_rate": 3.406310401088536e-06, "loss": 0.76557529, "num_input_tokens_seen": 90351825, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5859375, "step": 4189, "time_per_iteration": 2.4144246578216553 }, { "auxiliary_loss_clip": 0.01082466, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.01698756, "balance_loss_mlp": 1.0256393, "epoch": 0.25191642867879155, "flos": 20995368537600.0, "grad_norm": 1.9791243396858431, "language_loss": 0.84094393, "learning_rate": 3.4060417305682274e-06, "loss": 0.86210507, "num_input_tokens_seen": 90369860, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.56640625, "step": 4190, "time_per_iteration": 2.387373685836792 }, { "auxiliary_loss_clip": 0.01088221, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.01832414, "balance_loss_mlp": 1.02850175, "epoch": 0.2519765519314595, "flos": 21104611781760.0, "grad_norm": 2.8854020559471363, "language_loss": 0.75412244, "learning_rate": 3.4057730098699065e-06, "loss": 0.77537614, "num_input_tokens_seen": 90389245, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.59765625, "step": 4191, "time_per_iteration": 2.384077548980713 }, { "auxiliary_loss_clip": 0.01022705, "auxiliary_loss_mlp": 0.01001011, "balance_loss_clip": 0.99906814, "balance_loss_mlp": 1.00911868, "epoch": 0.2520366751841275, "flos": 62741503560960.0, "grad_norm": 1.3038135781693232, "language_loss": 0.57152498, "learning_rate": 3.405504239003163e-06, "loss": 0.59176207, "num_input_tokens_seen": 90456735, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.13574219, "step": 4192, "time_per_iteration": 3.1122801303863525 }, { "auxiliary_loss_clip": 0.0108913, "auxiliary_loss_mlp": 0.01036046, "balance_loss_clip": 1.01792598, "balance_loss_mlp": 1.03009593, "epoch": 0.25209679843679544, "flos": 22229792945280.0, "grad_norm": 1.962746330273317, "language_loss": 0.76137161, "learning_rate": 3.4052354179775883e-06, "loss": 0.78262341, "num_input_tokens_seen": 90474165, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 4193, "time_per_iteration": 2.3875482082366943 }, { "auxiliary_loss_clip": 0.01089056, "auxiliary_loss_mlp": 0.01036504, "balance_loss_clip": 1.01816988, "balance_loss_mlp": 1.02842879, "epoch": 0.2521569216894634, "flos": 12165833963520.0, "grad_norm": 2.4049709684284055, "language_loss": 0.83728766, "learning_rate": 3.4049665468027763e-06, "loss": 0.85854328, "num_input_tokens_seen": 90491660, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.60546875, "step": 4194, "time_per_iteration": 2.385765314102173 }, { "auxiliary_loss_clip": 0.01088363, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.0210371, "balance_loss_mlp": 1.02698064, "epoch": 0.2522170449421314, "flos": 23698553598720.0, "grad_norm": 1.4476206676743426, "language_loss": 0.88394225, "learning_rate": 3.404697625488322e-06, "loss": 0.90520626, "num_input_tokens_seen": 90514025, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.61328125, "step": 4195, "time_per_iteration": 2.4313528537750244 }, { "auxiliary_loss_clip": 0.01088156, "auxiliary_loss_mlp": 0.01036198, "balance_loss_clip": 1.01648068, "balance_loss_mlp": 1.02781153, "epoch": 0.25227716819479934, "flos": 20954520380160.0, "grad_norm": 2.4660462571920565, "language_loss": 0.86479378, "learning_rate": 3.4044286540438233e-06, "loss": 0.88603729, "num_input_tokens_seen": 90533530, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.6015625, "step": 4196, "time_per_iteration": 2.407398223876953 }, { "auxiliary_loss_clip": 0.01087827, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.01951194, "balance_loss_mlp": 1.0276016, "epoch": 0.2523372914474673, "flos": 23330220088320.0, "grad_norm": 1.7398215060718052, "language_loss": 0.83336049, "learning_rate": 3.4041596324788778e-06, "loss": 0.85461068, "num_input_tokens_seen": 90554025, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.6015625, "step": 4197, "time_per_iteration": 2.426469326019287 }, { "auxiliary_loss_clip": 0.01092347, "auxiliary_loss_mlp": 0.01037642, "balance_loss_clip": 1.01761484, "balance_loss_mlp": 1.02999306, "epoch": 0.25239741470013527, "flos": 36969005934720.0, "grad_norm": 1.8691213880443476, "language_loss": 0.72345132, "learning_rate": 3.403890560803088e-06, "loss": 0.74475121, "num_input_tokens_seen": 90576930, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.625, "step": 4198, "time_per_iteration": 2.512997627258301 }, { "auxiliary_loss_clip": 0.0109069, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.02180219, "balance_loss_mlp": 1.02847815, "epoch": 0.25245753795280323, "flos": 18514754593920.0, "grad_norm": 1.7566041300172366, "language_loss": 0.77064091, "learning_rate": 3.4036214390260546e-06, "loss": 0.79195631, "num_input_tokens_seen": 90595710, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.62109375, "step": 4199, "time_per_iteration": 2.366682529449463 }, { "auxiliary_loss_clip": 0.01086724, "auxiliary_loss_mlp": 0.01032855, "balance_loss_clip": 1.01598668, "balance_loss_mlp": 1.02713084, "epoch": 0.2525176612054712, "flos": 32343467569920.0, "grad_norm": 1.9349715598519899, "language_loss": 0.73080075, "learning_rate": 3.403352267157383e-06, "loss": 0.75199652, "num_input_tokens_seen": 90617945, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.59375, "step": 4200, "time_per_iteration": 2.491309404373169 }, { "auxiliary_loss_clip": 0.01088802, "auxiliary_loss_mlp": 0.01036934, "balance_loss_clip": 1.02017355, "balance_loss_mlp": 1.02801931, "epoch": 0.25257778445813917, "flos": 45256513651200.0, "grad_norm": 1.5135461051709393, "language_loss": 0.82346237, "learning_rate": 3.4030830452066785e-06, "loss": 0.84471977, "num_input_tokens_seen": 90640855, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.60546875, "step": 4201, "time_per_iteration": 4.075268268585205 }, { "auxiliary_loss_clip": 0.01089096, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.01867688, "balance_loss_mlp": 1.02672911, "epoch": 0.25263790771080713, "flos": 23366669414400.0, "grad_norm": 2.7475426529850826, "language_loss": 0.74723589, "learning_rate": 3.4028137731835492e-06, "loss": 0.76849353, "num_input_tokens_seen": 90661350, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 4202, "time_per_iteration": 2.4181506633758545 }, { "auxiliary_loss_clip": 0.01086922, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.02253139, "balance_loss_mlp": 1.02769208, "epoch": 0.25269803096347515, "flos": 18514056366720.0, "grad_norm": 1.9061824622614174, "language_loss": 0.73042041, "learning_rate": 3.4025444510976045e-06, "loss": 0.75168931, "num_input_tokens_seen": 90680540, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.59375, "step": 4203, "time_per_iteration": 2.3866050243377686 }, { "auxiliary_loss_clip": 0.01085818, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.01728582, "balance_loss_mlp": 1.0260551, "epoch": 0.2527581542161431, "flos": 24609332903040.0, "grad_norm": 2.1212812715222102, "language_loss": 0.77547503, "learning_rate": 3.4022750789584568e-06, "loss": 0.79668248, "num_input_tokens_seen": 90703460, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59765625, "step": 4204, "time_per_iteration": 2.4628543853759766 }, { "auxiliary_loss_clip": 0.01086552, "auxiliary_loss_mlp": 0.01041565, "balance_loss_clip": 1.0235641, "balance_loss_mlp": 1.02526116, "epoch": 0.2528182774688111, "flos": 12640443252480.0, "grad_norm": 1.992688282780428, "language_loss": 0.72095698, "learning_rate": 3.4020056567757183e-06, "loss": 0.74223816, "num_input_tokens_seen": 90718815, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.61328125, "step": 4205, "time_per_iteration": 3.713306188583374 }, { "auxiliary_loss_clip": 0.01083553, "auxiliary_loss_mlp": 0.0103194, "balance_loss_clip": 1.01643646, "balance_loss_mlp": 1.02674294, "epoch": 0.25287840072147905, "flos": 46935032952960.0, "grad_norm": 1.311174117916942, "language_loss": 0.75730765, "learning_rate": 3.401736184559005e-06, "loss": 0.77846253, "num_input_tokens_seen": 90742125, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5703125, "step": 4206, "time_per_iteration": 2.612483263015747 }, { "auxiliary_loss_clip": 0.0108488, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.01982069, "balance_loss_mlp": 1.02526879, "epoch": 0.252938523974147, "flos": 18878724184320.0, "grad_norm": 1.7209736363570025, "language_loss": 0.79218537, "learning_rate": 3.401466662317932e-06, "loss": 0.81340957, "num_input_tokens_seen": 90760785, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.59375, "step": 4207, "time_per_iteration": 3.774508476257324 }, { "auxiliary_loss_clip": 0.01085521, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.01836991, "balance_loss_mlp": 1.0268259, "epoch": 0.252998647226815, "flos": 21433633234560.0, "grad_norm": 1.4719136538962954, "language_loss": 0.7642712, "learning_rate": 3.4011970900621192e-06, "loss": 0.78547311, "num_input_tokens_seen": 90780045, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5859375, "step": 4208, "time_per_iteration": 2.397585391998291 }, { "auxiliary_loss_clip": 0.01082874, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 1.01245856, "balance_loss_mlp": 1.0249666, "epoch": 0.25305877047948294, "flos": 25441138978560.0, "grad_norm": 2.1309049178116477, "language_loss": 0.69913232, "learning_rate": 3.400927467801186e-06, "loss": 0.72025931, "num_input_tokens_seen": 90797980, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.578125, "step": 4209, "time_per_iteration": 2.41082501411438 }, { "auxiliary_loss_clip": 0.01020904, "auxiliary_loss_mlp": 0.01007153, "balance_loss_clip": 1.00536501, "balance_loss_mlp": 1.00777543, "epoch": 0.2531188937321509, "flos": 60182335324800.0, "grad_norm": 0.7702165785040264, "language_loss": 0.55134249, "learning_rate": 3.400657795544756e-06, "loss": 0.57162297, "num_input_tokens_seen": 90864865, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.13085938, "step": 4210, "time_per_iteration": 4.450890779495239 }, { "auxiliary_loss_clip": 0.01084906, "auxiliary_loss_mlp": 0.01032695, "balance_loss_clip": 1.01618409, "balance_loss_mlp": 1.02576089, "epoch": 0.25317901698481887, "flos": 19681377408000.0, "grad_norm": 2.7408645727011143, "language_loss": 0.79702961, "learning_rate": 3.400388073302452e-06, "loss": 0.8182056, "num_input_tokens_seen": 90882885, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.58984375, "step": 4211, "time_per_iteration": 2.3642923831939697 }, { "auxiliary_loss_clip": 0.01083493, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.01818037, "balance_loss_mlp": 1.02719128, "epoch": 0.25323914023748684, "flos": 24423246023040.0, "grad_norm": 1.536516128160232, "language_loss": 0.78452933, "learning_rate": 3.4001183010838995e-06, "loss": 0.80570906, "num_input_tokens_seen": 90902985, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5625, "step": 4212, "time_per_iteration": 2.4152677059173584 }, { "auxiliary_loss_clip": 0.01085529, "auxiliary_loss_mlp": 0.01033761, "balance_loss_clip": 1.01636815, "balance_loss_mlp": 1.02613354, "epoch": 0.2532992634901548, "flos": 25446270948480.0, "grad_norm": 2.47862791972638, "language_loss": 0.53626394, "learning_rate": 3.3998484788987264e-06, "loss": 0.55745685, "num_input_tokens_seen": 90923550, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59375, "step": 4213, "time_per_iteration": 2.415127754211426 }, { "auxiliary_loss_clip": 0.01087302, "auxiliary_loss_mlp": 0.0104273, "balance_loss_clip": 1.02405012, "balance_loss_mlp": 1.02718604, "epoch": 0.25335938674282277, "flos": 18879527145600.0, "grad_norm": 2.1809503383736786, "language_loss": 0.64555001, "learning_rate": 3.3995786067565623e-06, "loss": 0.66685027, "num_input_tokens_seen": 90943260, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.6015625, "step": 4214, "time_per_iteration": 2.403613567352295 }, { "auxiliary_loss_clip": 0.01016404, "auxiliary_loss_mlp": 0.01008317, "balance_loss_clip": 1.00636172, "balance_loss_mlp": 1.00312185, "epoch": 0.25341950999549073, "flos": 53059809588480.0, "grad_norm": 0.8423539403247228, "language_loss": 0.58051109, "learning_rate": 3.3993086846670376e-06, "loss": 0.60075825, "num_input_tokens_seen": 90996295, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.1328125, "step": 4215, "time_per_iteration": 2.7951724529266357 }, { "auxiliary_loss_clip": 0.01085358, "auxiliary_loss_mlp": 0.01030724, "balance_loss_clip": 1.01349866, "balance_loss_mlp": 1.02700901, "epoch": 0.2534796332481587, "flos": 39018686567040.0, "grad_norm": 1.6232707828678796, "language_loss": 0.83765221, "learning_rate": 3.3990387126397854e-06, "loss": 0.85881305, "num_input_tokens_seen": 91017545, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58203125, "step": 4216, "time_per_iteration": 2.5465731620788574 }, { "auxiliary_loss_clip": 0.01088035, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.01587868, "balance_loss_mlp": 1.02852917, "epoch": 0.2535397565008267, "flos": 23585854129920.0, "grad_norm": 2.016891040914528, "language_loss": 0.80153388, "learning_rate": 3.3987686906844404e-06, "loss": 0.82274836, "num_input_tokens_seen": 91037715, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 4217, "time_per_iteration": 2.393773078918457 }, { "auxiliary_loss_clip": 0.01083478, "auxiliary_loss_mlp": 0.01034403, "balance_loss_clip": 1.01786864, "balance_loss_mlp": 1.02516747, "epoch": 0.2535998797534947, "flos": 19280364998400.0, "grad_norm": 2.1267848717555147, "language_loss": 0.75011122, "learning_rate": 3.398498618810639e-06, "loss": 0.77129006, "num_input_tokens_seen": 91055295, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5859375, "step": 4218, "time_per_iteration": 2.372349739074707 }, { "auxiliary_loss_clip": 0.01086645, "auxiliary_loss_mlp": 0.01034857, "balance_loss_clip": 1.01753581, "balance_loss_mlp": 1.02585709, "epoch": 0.25366000300616265, "flos": 24023246042880.0, "grad_norm": 1.6683740752352614, "language_loss": 0.74832523, "learning_rate": 3.398228497028019e-06, "loss": 0.76954031, "num_input_tokens_seen": 91075485, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.609375, "step": 4219, "time_per_iteration": 2.4075748920440674 }, { "auxiliary_loss_clip": 0.0108855, "auxiliary_loss_mlp": 0.01042162, "balance_loss_clip": 1.02486539, "balance_loss_mlp": 1.02807808, "epoch": 0.2537201262588306, "flos": 16288448060160.0, "grad_norm": 1.708694570165322, "language_loss": 0.81267452, "learning_rate": 3.397958325346221e-06, "loss": 0.83398163, "num_input_tokens_seen": 91093620, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.6015625, "step": 4220, "time_per_iteration": 2.3648316860198975 }, { "auxiliary_loss_clip": 0.01088201, "auxiliary_loss_mlp": 0.01037211, "balance_loss_clip": 1.0197705, "balance_loss_mlp": 1.02815259, "epoch": 0.2537802495114986, "flos": 23293561294080.0, "grad_norm": 3.126759026973794, "language_loss": 0.70966601, "learning_rate": 3.397688103774886e-06, "loss": 0.7309202, "num_input_tokens_seen": 91114110, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.6015625, "step": 4221, "time_per_iteration": 2.44842267036438 }, { "auxiliary_loss_clip": 0.01085814, "auxiliary_loss_mlp": 0.0103377, "balance_loss_clip": 1.01663923, "balance_loss_mlp": 1.02692389, "epoch": 0.25384037276416654, "flos": 17638190288640.0, "grad_norm": 1.6753200340995202, "language_loss": 0.61910427, "learning_rate": 3.397417832323658e-06, "loss": 0.64030015, "num_input_tokens_seen": 91133135, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 4222, "time_per_iteration": 2.399949312210083 }, { "auxiliary_loss_clip": 0.01090904, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.02009249, "balance_loss_mlp": 1.028826, "epoch": 0.2539004960168345, "flos": 21505973304960.0, "grad_norm": 1.7557900932408543, "language_loss": 0.7456938, "learning_rate": 3.397147511002182e-06, "loss": 0.7669946, "num_input_tokens_seen": 91151805, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.62109375, "step": 4223, "time_per_iteration": 2.3815507888793945 }, { "auxiliary_loss_clip": 0.01086685, "auxiliary_loss_mlp": 0.0103843, "balance_loss_clip": 1.02089465, "balance_loss_mlp": 1.02803266, "epoch": 0.2539606192695025, "flos": 23949788808960.0, "grad_norm": 1.487638072061383, "language_loss": 0.79764968, "learning_rate": 3.3968771398201056e-06, "loss": 0.81890082, "num_input_tokens_seen": 91172270, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5859375, "step": 4224, "time_per_iteration": 2.4221978187561035 }, { "auxiliary_loss_clip": 0.01082935, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.01698387, "balance_loss_mlp": 1.02560687, "epoch": 0.25402074252217044, "flos": 24168659322240.0, "grad_norm": 1.4043746669844293, "language_loss": 0.77372491, "learning_rate": 3.396606718787077e-06, "loss": 0.79489988, "num_input_tokens_seen": 91192080, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5703125, "step": 4225, "time_per_iteration": 2.4019360542297363 }, { "auxiliary_loss_clip": 0.01087068, "auxiliary_loss_mlp": 0.01045849, "balance_loss_clip": 1.02768099, "balance_loss_mlp": 1.02729058, "epoch": 0.2540808657748384, "flos": 22302831242880.0, "grad_norm": 2.6425992178423963, "language_loss": 0.84850371, "learning_rate": 3.396336247912747e-06, "loss": 0.86983287, "num_input_tokens_seen": 91211450, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59765625, "step": 4226, "time_per_iteration": 2.39050555229187 }, { "auxiliary_loss_clip": 0.01083659, "auxiliary_loss_mlp": 0.01044263, "balance_loss_clip": 1.02712059, "balance_loss_mlp": 1.02492332, "epoch": 0.25414098902750637, "flos": 27598317287040.0, "grad_norm": 1.5257596202667512, "language_loss": 0.70935285, "learning_rate": 3.396065727206768e-06, "loss": 0.73063207, "num_input_tokens_seen": 91231835, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 4227, "time_per_iteration": 2.4347763061523438 }, { "auxiliary_loss_clip": 0.01017919, "auxiliary_loss_mlp": 0.01000842, "balance_loss_clip": 0.99894607, "balance_loss_mlp": 1.00497365, "epoch": 0.25420111228017434, "flos": 58167847209600.0, "grad_norm": 0.9906463064364873, "language_loss": 0.61949646, "learning_rate": 3.395795156678795e-06, "loss": 0.63968408, "num_input_tokens_seen": 91288755, "router_z_loss_clip": 0.0189209, "router_z_loss_mlp": 0.12890625, "step": 4228, "time_per_iteration": 2.874880790710449 }, { "auxiliary_loss_clip": 0.01087209, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.01291287, "balance_loss_mlp": 1.02655411, "epoch": 0.2542612355328423, "flos": 11463870700800.0, "grad_norm": 2.3320528795276307, "language_loss": 0.85919857, "learning_rate": 3.395524536338483e-06, "loss": 0.88038766, "num_input_tokens_seen": 91302485, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.60546875, "step": 4229, "time_per_iteration": 2.342778205871582 }, { "auxiliary_loss_clip": 0.01087712, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.02093852, "balance_loss_mlp": 1.0279516, "epoch": 0.2543213587855103, "flos": 22964784220800.0, "grad_norm": 2.0199073729481953, "language_loss": 0.77261305, "learning_rate": 3.3952538661954893e-06, "loss": 0.79388565, "num_input_tokens_seen": 91321120, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.59765625, "step": 4230, "time_per_iteration": 2.4160499572753906 }, { "auxiliary_loss_clip": 0.01084105, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.01538348, "balance_loss_mlp": 1.02499151, "epoch": 0.2543814820381783, "flos": 18252382659840.0, "grad_norm": 2.2727168383174248, "language_loss": 0.75731349, "learning_rate": 3.3949831462594743e-06, "loss": 0.77849293, "num_input_tokens_seen": 91338575, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.58984375, "step": 4231, "time_per_iteration": 2.355001449584961 }, { "auxiliary_loss_clip": 0.01085398, "auxiliary_loss_mlp": 0.01037483, "balance_loss_clip": 1.01989961, "balance_loss_mlp": 1.0256772, "epoch": 0.25444160529084625, "flos": 15631801608960.0, "grad_norm": 1.8001754541316275, "language_loss": 0.73925924, "learning_rate": 3.3947123765400994e-06, "loss": 0.76048803, "num_input_tokens_seen": 91357355, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 4232, "time_per_iteration": 2.379716634750366 }, { "auxiliary_loss_clip": 0.01086377, "auxiliary_loss_mlp": 0.01045583, "balance_loss_clip": 1.02578199, "balance_loss_mlp": 1.02716863, "epoch": 0.2545017285435142, "flos": 24600639974400.0, "grad_norm": 1.7991696273225741, "language_loss": 0.8663975, "learning_rate": 3.394441557047028e-06, "loss": 0.88771713, "num_input_tokens_seen": 91376515, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.59375, "step": 4233, "time_per_iteration": 2.4171550273895264 }, { "auxiliary_loss_clip": 0.01080724, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.02109265, "balance_loss_mlp": 1.02476168, "epoch": 0.2545618517961822, "flos": 24677972369280.0, "grad_norm": 1.5881667641686952, "language_loss": 0.7487973, "learning_rate": 3.3941706877899236e-06, "loss": 0.76998085, "num_input_tokens_seen": 91397595, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5625, "step": 4234, "time_per_iteration": 2.4654290676116943 }, { "auxiliary_loss_clip": 0.01086916, "auxiliary_loss_mlp": 0.01038339, "balance_loss_clip": 1.02132797, "balance_loss_mlp": 1.02555203, "epoch": 0.25462197504885015, "flos": 23913967887360.0, "grad_norm": 1.3432440338004685, "language_loss": 0.74730933, "learning_rate": 3.393899768778454e-06, "loss": 0.76856196, "num_input_tokens_seen": 91417775, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.6171875, "step": 4235, "time_per_iteration": 2.4672210216522217 }, { "auxiliary_loss_clip": 0.01091606, "auxiliary_loss_mlp": 0.01042467, "balance_loss_clip": 1.02248764, "balance_loss_mlp": 1.02715826, "epoch": 0.2546820983015181, "flos": 24788262954240.0, "grad_norm": 2.5389337739658586, "language_loss": 0.64470553, "learning_rate": 3.393628800022287e-06, "loss": 0.66604626, "num_input_tokens_seen": 91437665, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.64453125, "step": 4236, "time_per_iteration": 2.437122106552124 }, { "auxiliary_loss_clip": 0.01084494, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.01757097, "balance_loss_mlp": 1.02604151, "epoch": 0.2547422215541861, "flos": 18733136348160.0, "grad_norm": 1.7228645594771752, "language_loss": 0.66689718, "learning_rate": 3.393357781531093e-06, "loss": 0.68808103, "num_input_tokens_seen": 91456705, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5859375, "step": 4237, "time_per_iteration": 2.3892457485198975 }, { "auxiliary_loss_clip": 0.01086732, "auxiliary_loss_mlp": 0.01039802, "balance_loss_clip": 1.02211189, "balance_loss_mlp": 1.02712703, "epoch": 0.25480234480685404, "flos": 21031398927360.0, "grad_norm": 2.1291563061300707, "language_loss": 0.75285828, "learning_rate": 3.393086713314544e-06, "loss": 0.77412361, "num_input_tokens_seen": 91475535, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.59765625, "step": 4238, "time_per_iteration": 2.3946433067321777 }, { "auxiliary_loss_clip": 0.01090295, "auxiliary_loss_mlp": 0.01039888, "balance_loss_clip": 1.02094567, "balance_loss_mlp": 1.02938437, "epoch": 0.254862468059522, "flos": 25081009637760.0, "grad_norm": 2.803089062677574, "language_loss": 0.80558288, "learning_rate": 3.3928155953823137e-06, "loss": 0.82688469, "num_input_tokens_seen": 91499140, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.609375, "step": 4239, "time_per_iteration": 2.4331414699554443 }, { "auxiliary_loss_clip": 0.0108447, "auxiliary_loss_mlp": 0.01033939, "balance_loss_clip": 1.01623678, "balance_loss_mlp": 1.02648723, "epoch": 0.25492259131219, "flos": 20557348220160.0, "grad_norm": 1.748066251671453, "language_loss": 0.77362287, "learning_rate": 3.3925444277440774e-06, "loss": 0.79480696, "num_input_tokens_seen": 91518335, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.578125, "step": 4240, "time_per_iteration": 3.7819595336914062 }, { "auxiliary_loss_clip": 0.01089638, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.01597381, "balance_loss_mlp": 1.02600455, "epoch": 0.25498271456485794, "flos": 25041418289280.0, "grad_norm": 1.719907468259334, "language_loss": 0.83467364, "learning_rate": 3.392273210409512e-06, "loss": 0.85592937, "num_input_tokens_seen": 91537655, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.6328125, "step": 4241, "time_per_iteration": 2.411308765411377 }, { "auxiliary_loss_clip": 0.01086503, "auxiliary_loss_mlp": 0.0104171, "balance_loss_clip": 1.02353048, "balance_loss_mlp": 1.02611661, "epoch": 0.2550428378175259, "flos": 26177177594880.0, "grad_norm": 1.7606285079797142, "language_loss": 0.7337594, "learning_rate": 3.392001943388298e-06, "loss": 0.75504154, "num_input_tokens_seen": 91557545, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.60546875, "step": 4242, "time_per_iteration": 2.4295384883880615 }, { "auxiliary_loss_clip": 0.01085302, "auxiliary_loss_mlp": 0.01034616, "balance_loss_clip": 1.01708043, "balance_loss_mlp": 1.0259043, "epoch": 0.2551029610701939, "flos": 15266295918720.0, "grad_norm": 2.2515804794555145, "language_loss": 0.72305548, "learning_rate": 3.3917306266901146e-06, "loss": 0.74425465, "num_input_tokens_seen": 91574405, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 4243, "time_per_iteration": 2.3567705154418945 }, { "auxiliary_loss_clip": 0.0108642, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.01590562, "balance_loss_mlp": 1.0265739, "epoch": 0.2551630843228619, "flos": 18111263477760.0, "grad_norm": 1.5851356583756018, "language_loss": 0.81755608, "learning_rate": 3.3914592603246458e-06, "loss": 0.83875787, "num_input_tokens_seen": 91593755, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.59765625, "step": 4244, "time_per_iteration": 3.7711994647979736 }, { "auxiliary_loss_clip": 0.01016159, "auxiliary_loss_mlp": 0.01010972, "balance_loss_clip": 1.00915968, "balance_loss_mlp": 1.00335169, "epoch": 0.25522320757552985, "flos": 70516381651200.0, "grad_norm": 0.6901398321048748, "language_loss": 0.57706642, "learning_rate": 3.391187844301575e-06, "loss": 0.59733772, "num_input_tokens_seen": 91660335, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.12890625, "step": 4245, "time_per_iteration": 3.1313469409942627 }, { "auxiliary_loss_clip": 0.01086005, "auxiliary_loss_mlp": 0.01046074, "balance_loss_clip": 1.02726281, "balance_loss_mlp": 1.02540946, "epoch": 0.2552833308281978, "flos": 22891990302720.0, "grad_norm": 3.2779920259881727, "language_loss": 0.65447509, "learning_rate": 3.3909163786305884e-06, "loss": 0.67579591, "num_input_tokens_seen": 91678500, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.609375, "step": 4246, "time_per_iteration": 3.786039352416992 }, { "auxiliary_loss_clip": 0.01081173, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.02039862, "balance_loss_mlp": 1.02552485, "epoch": 0.2553434540808658, "flos": 22052538639360.0, "grad_norm": 2.3662608062947648, "language_loss": 0.81410658, "learning_rate": 3.390644863321374e-06, "loss": 0.8352915, "num_input_tokens_seen": 91696430, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5546875, "step": 4247, "time_per_iteration": 2.4006686210632324 }, { "auxiliary_loss_clip": 0.01090575, "auxiliary_loss_mlp": 0.0103918, "balance_loss_clip": 1.01817489, "balance_loss_mlp": 1.02558231, "epoch": 0.25540357733353375, "flos": 16543279140480.0, "grad_norm": 3.738766888506868, "language_loss": 0.83157945, "learning_rate": 3.390373298383622e-06, "loss": 0.8528769, "num_input_tokens_seen": 91713270, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 0.6484375, "step": 4248, "time_per_iteration": 2.34535551071167 }, { "auxiliary_loss_clip": 0.01087325, "auxiliary_loss_mlp": 0.01034741, "balance_loss_clip": 1.01693141, "balance_loss_mlp": 1.02715528, "epoch": 0.2554637005862017, "flos": 17564104650240.0, "grad_norm": 1.8196853331893625, "language_loss": 0.84300339, "learning_rate": 3.390101683827023e-06, "loss": 0.86422402, "num_input_tokens_seen": 91728865, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6015625, "step": 4249, "time_per_iteration": 2.344696044921875 }, { "auxiliary_loss_clip": 0.01014514, "auxiliary_loss_mlp": 0.01004274, "balance_loss_clip": 1.0023433, "balance_loss_mlp": 1.00165677, "epoch": 0.2555238238388697, "flos": 72241650996480.0, "grad_norm": 0.769060138190621, "language_loss": 0.5633142, "learning_rate": 3.389830019661271e-06, "loss": 0.58350205, "num_input_tokens_seen": 91787470, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.12890625, "step": 4250, "time_per_iteration": 4.34425950050354 }, { "auxiliary_loss_clip": 0.01086496, "auxiliary_loss_mlp": 0.01031947, "balance_loss_clip": 1.0136838, "balance_loss_mlp": 1.02591741, "epoch": 0.25558394709153764, "flos": 24388262974080.0, "grad_norm": 5.30759907953262, "language_loss": 0.80202079, "learning_rate": 3.3895583058960604e-06, "loss": 0.82320523, "num_input_tokens_seen": 91805640, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.60546875, "step": 4251, "time_per_iteration": 2.4071388244628906 }, { "auxiliary_loss_clip": 0.01014107, "auxiliary_loss_mlp": 0.01002477, "balance_loss_clip": 1.00067675, "balance_loss_mlp": 1.00154877, "epoch": 0.2556440703442056, "flos": 69227772946560.0, "grad_norm": 0.860774705426065, "language_loss": 0.66104627, "learning_rate": 3.3892865425410884e-06, "loss": 0.68121207, "num_input_tokens_seen": 91869695, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.125, "step": 4252, "time_per_iteration": 3.0563292503356934 }, { "auxiliary_loss_clip": 0.01084835, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.01695287, "balance_loss_mlp": 1.02676415, "epoch": 0.2557041935968736, "flos": 24862732617600.0, "grad_norm": 3.350375536497017, "language_loss": 0.73127812, "learning_rate": 3.389014729606054e-06, "loss": 0.75245857, "num_input_tokens_seen": 91889920, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.58203125, "step": 4253, "time_per_iteration": 2.4125022888183594 }, { "auxiliary_loss_clip": 0.01086663, "auxiliary_loss_mlp": 0.01040386, "balance_loss_clip": 1.02300525, "balance_loss_mlp": 1.02779603, "epoch": 0.25576431684954154, "flos": 22491012804480.0, "grad_norm": 2.293465887887409, "language_loss": 0.72769624, "learning_rate": 3.388742867100656e-06, "loss": 0.74896675, "num_input_tokens_seen": 91908665, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.58984375, "step": 4254, "time_per_iteration": 2.400686264038086 }, { "auxiliary_loss_clip": 0.01086128, "auxiliary_loss_mlp": 0.01038194, "balance_loss_clip": 1.01901293, "balance_loss_mlp": 1.02678394, "epoch": 0.2558244401022095, "flos": 19825778257920.0, "grad_norm": 1.6383043855915715, "language_loss": 0.80807006, "learning_rate": 3.388470955034598e-06, "loss": 0.82931328, "num_input_tokens_seen": 91927855, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.59375, "step": 4255, "time_per_iteration": 2.3769102096557617 }, { "auxiliary_loss_clip": 0.01085968, "auxiliary_loss_mlp": 0.01036717, "balance_loss_clip": 1.01915765, "balance_loss_mlp": 1.02709889, "epoch": 0.2558845633548775, "flos": 23219405832960.0, "grad_norm": 1.5961840010848884, "language_loss": 0.85364938, "learning_rate": 3.3881989934175822e-06, "loss": 0.8748762, "num_input_tokens_seen": 91948500, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.58984375, "step": 4256, "time_per_iteration": 2.415609121322632 }, { "auxiliary_loss_clip": 0.01088586, "auxiliary_loss_mlp": 0.01036816, "balance_loss_clip": 1.01798713, "balance_loss_mlp": 1.02706409, "epoch": 0.2559446866075455, "flos": 16836898608000.0, "grad_norm": 2.0173160266685497, "language_loss": 0.75138247, "learning_rate": 3.387926982259316e-06, "loss": 0.77263653, "num_input_tokens_seen": 91968375, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6171875, "step": 4257, "time_per_iteration": 2.3902125358581543 }, { "auxiliary_loss_clip": 0.01018854, "auxiliary_loss_mlp": 0.01003803, "balance_loss_clip": 1.0018599, "balance_loss_mlp": 1.00605226, "epoch": 0.25600480986021346, "flos": 57590627834880.0, "grad_norm": 0.7968367734150151, "language_loss": 0.65308678, "learning_rate": 3.387654921569505e-06, "loss": 0.67331338, "num_input_tokens_seen": 92028490, "router_z_loss_clip": 0.01940918, "router_z_loss_mlp": 0.12792969, "step": 4258, "time_per_iteration": 2.994932174682617 }, { "auxiliary_loss_clip": 0.01082901, "auxiliary_loss_mlp": 0.01031022, "balance_loss_clip": 1.015203, "balance_loss_mlp": 1.02647078, "epoch": 0.2560649331128814, "flos": 27818270052480.0, "grad_norm": 1.6186365576918889, "language_loss": 0.7640518, "learning_rate": 3.3873828113578604e-06, "loss": 0.78519106, "num_input_tokens_seen": 92048060, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5625, "step": 4259, "time_per_iteration": 2.4414870738983154 }, { "auxiliary_loss_clip": 0.01017088, "auxiliary_loss_mlp": 0.01002281, "balance_loss_clip": 1.00038564, "balance_loss_mlp": 1.00432873, "epoch": 0.2561250563655494, "flos": 70946896026240.0, "grad_norm": 0.7973289789284315, "language_loss": 0.58468884, "learning_rate": 3.387110651634092e-06, "loss": 0.60488254, "num_input_tokens_seen": 92118180, "router_z_loss_clip": 0.0189209, "router_z_loss_mlp": 0.12695312, "step": 4260, "time_per_iteration": 3.1298787593841553 }, { "auxiliary_loss_clip": 0.01083809, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.01862574, "balance_loss_mlp": 1.02459311, "epoch": 0.25618517961821735, "flos": 27011217997440.0, "grad_norm": 1.8000391782145087, "language_loss": 0.77640504, "learning_rate": 3.3868384424079122e-06, "loss": 0.79761088, "num_input_tokens_seen": 92137570, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 4261, "time_per_iteration": 2.428130626678467 }, { "auxiliary_loss_clip": 0.01083037, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.0212872, "balance_loss_mlp": 1.02597356, "epoch": 0.2562453028708853, "flos": 23067394306560.0, "grad_norm": 1.5256314587185946, "language_loss": 0.83025563, "learning_rate": 3.3865661836890356e-06, "loss": 0.85145652, "num_input_tokens_seen": 92157625, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5703125, "step": 4262, "time_per_iteration": 2.4298012256622314 }, { "auxiliary_loss_clip": 0.01088283, "auxiliary_loss_mlp": 0.01035921, "balance_loss_clip": 1.01607275, "balance_loss_mlp": 1.02493632, "epoch": 0.2563054261235533, "flos": 15120079678080.0, "grad_norm": 2.158784763976801, "language_loss": 0.74157685, "learning_rate": 3.3862938754871786e-06, "loss": 0.76281887, "num_input_tokens_seen": 92175350, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.6328125, "step": 4263, "time_per_iteration": 2.3613972663879395 }, { "auxiliary_loss_clip": 0.0108525, "auxiliary_loss_mlp": 0.01049369, "balance_loss_clip": 1.03203559, "balance_loss_mlp": 1.02811015, "epoch": 0.25636554937622125, "flos": 27853637126400.0, "grad_norm": 1.9657386555212584, "language_loss": 0.82568431, "learning_rate": 3.3860215178120597e-06, "loss": 0.84703052, "num_input_tokens_seen": 92196070, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5703125, "step": 4264, "time_per_iteration": 2.458491086959839 }, { "auxiliary_loss_clip": 0.01084013, "auxiliary_loss_mlp": 0.01035816, "balance_loss_clip": 1.01804233, "balance_loss_mlp": 1.02565813, "epoch": 0.2564256726288892, "flos": 28905430878720.0, "grad_norm": 1.7246676496995823, "language_loss": 0.74102837, "learning_rate": 3.385749110673398e-06, "loss": 0.7622267, "num_input_tokens_seen": 92216310, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58203125, "step": 4265, "time_per_iteration": 2.4494590759277344 }, { "auxiliary_loss_clip": 0.01079177, "auxiliary_loss_mlp": 0.01032899, "balance_loss_clip": 1.01693678, "balance_loss_mlp": 1.02260828, "epoch": 0.2564857958815572, "flos": 18513951632640.0, "grad_norm": 1.623519102787576, "language_loss": 0.81270957, "learning_rate": 3.3854766540809143e-06, "loss": 0.8338303, "num_input_tokens_seen": 92234510, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.56640625, "step": 4266, "time_per_iteration": 2.3781354427337646 }, { "auxiliary_loss_clip": 0.01082828, "auxiliary_loss_mlp": 0.01032787, "balance_loss_clip": 1.0170517, "balance_loss_mlp": 1.02566898, "epoch": 0.25654591913422514, "flos": 25807203250560.0, "grad_norm": 1.4080021820802395, "language_loss": 0.79135948, "learning_rate": 3.3852041480443337e-06, "loss": 0.81251562, "num_input_tokens_seen": 92254070, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5703125, "step": 4267, "time_per_iteration": 2.4150991439819336 }, { "auxiliary_loss_clip": 0.01080846, "auxiliary_loss_mlp": 0.0103577, "balance_loss_clip": 1.01916432, "balance_loss_mlp": 1.02583539, "epoch": 0.2566060423868931, "flos": 19098642038400.0, "grad_norm": 1.5835765877395969, "language_loss": 0.7891286, "learning_rate": 3.3849315925733793e-06, "loss": 0.81029481, "num_input_tokens_seen": 92275060, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.546875, "step": 4268, "time_per_iteration": 2.4076755046844482 }, { "auxiliary_loss_clip": 0.01083921, "auxiliary_loss_mlp": 0.01041253, "balance_loss_clip": 1.02471876, "balance_loss_mlp": 1.02735698, "epoch": 0.25666616563956113, "flos": 23841523082880.0, "grad_norm": 1.5222660198042288, "language_loss": 0.67860067, "learning_rate": 3.384658987677779e-06, "loss": 0.69985247, "num_input_tokens_seen": 92293610, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5625, "step": 4269, "time_per_iteration": 2.4145662784576416 }, { "auxiliary_loss_clip": 0.01086155, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.01988482, "balance_loss_mlp": 1.02690673, "epoch": 0.2567262888922291, "flos": 14603574890880.0, "grad_norm": 2.464202610279144, "language_loss": 0.78836644, "learning_rate": 3.3843863333672617e-06, "loss": 0.80960071, "num_input_tokens_seen": 92308305, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59375, "step": 4270, "time_per_iteration": 2.348062038421631 }, { "auxiliary_loss_clip": 0.01086721, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.01806307, "balance_loss_mlp": 1.02600288, "epoch": 0.25678641214489706, "flos": 32921839019520.0, "grad_norm": 2.483350980795855, "language_loss": 0.67900097, "learning_rate": 3.3841136296515574e-06, "loss": 0.70023876, "num_input_tokens_seen": 92329875, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.609375, "step": 4271, "time_per_iteration": 2.4771621227264404 }, { "auxiliary_loss_clip": 0.01086933, "auxiliary_loss_mlp": 0.01042427, "balance_loss_clip": 1.02485585, "balance_loss_mlp": 1.02654815, "epoch": 0.256846535397565, "flos": 24097750617600.0, "grad_norm": 1.3568710177011531, "language_loss": 0.87126815, "learning_rate": 3.3838408765403974e-06, "loss": 0.89256179, "num_input_tokens_seen": 92348780, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.60546875, "step": 4272, "time_per_iteration": 2.4306132793426514 }, { "auxiliary_loss_clip": 0.01083198, "auxiliary_loss_mlp": 0.01034773, "balance_loss_clip": 1.01661706, "balance_loss_mlp": 1.0257237, "epoch": 0.256906658650233, "flos": 19717442709120.0, "grad_norm": 1.7764758828185845, "language_loss": 0.82027292, "learning_rate": 3.3835680740435164e-06, "loss": 0.8414526, "num_input_tokens_seen": 92368175, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.57421875, "step": 4273, "time_per_iteration": 2.3793649673461914 }, { "auxiliary_loss_clip": 0.01078188, "auxiliary_loss_mlp": 0.01040335, "balance_loss_clip": 1.02480221, "balance_loss_mlp": 1.02476501, "epoch": 0.25696678190290095, "flos": 22925018315520.0, "grad_norm": 1.5924830037239621, "language_loss": 0.77150172, "learning_rate": 3.38329522217065e-06, "loss": 0.79268694, "num_input_tokens_seen": 92387755, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.53515625, "step": 4274, "time_per_iteration": 2.4188337326049805 }, { "auxiliary_loss_clip": 0.01079178, "auxiliary_loss_mlp": 0.01029395, "balance_loss_clip": 1.01338482, "balance_loss_mlp": 1.02376032, "epoch": 0.2570269051555689, "flos": 27306617944320.0, "grad_norm": 1.6610631604241781, "language_loss": 0.83655322, "learning_rate": 3.383022320931535e-06, "loss": 0.85763896, "num_input_tokens_seen": 92409850, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5546875, "step": 4275, "time_per_iteration": 2.4397976398468018 }, { "auxiliary_loss_clip": 0.01082623, "auxiliary_loss_mlp": 0.01028516, "balance_loss_clip": 1.01073003, "balance_loss_mlp": 1.02453136, "epoch": 0.2570870284082369, "flos": 27562182163200.0, "grad_norm": 1.9879882609096986, "language_loss": 0.78657633, "learning_rate": 3.3827493703359116e-06, "loss": 0.80768776, "num_input_tokens_seen": 92431250, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58203125, "step": 4276, "time_per_iteration": 2.4634361267089844 }, { "auxiliary_loss_clip": 0.01081022, "auxiliary_loss_mlp": 0.01036053, "balance_loss_clip": 1.0200851, "balance_loss_mlp": 1.02425969, "epoch": 0.25714715166090485, "flos": 28729573027200.0, "grad_norm": 1.5803680779159166, "language_loss": 0.79060209, "learning_rate": 3.38247637039352e-06, "loss": 0.81177282, "num_input_tokens_seen": 92452065, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5703125, "step": 4277, "time_per_iteration": 2.441990613937378 }, { "auxiliary_loss_clip": 0.01079616, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.01493597, "balance_loss_mlp": 1.02376914, "epoch": 0.2572072749135728, "flos": 20115243273600.0, "grad_norm": 4.284323005793387, "language_loss": 0.78460282, "learning_rate": 3.3822033211141018e-06, "loss": 0.80570781, "num_input_tokens_seen": 92470025, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55859375, "step": 4278, "time_per_iteration": 2.3912453651428223 }, { "auxiliary_loss_clip": 0.01084793, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.01468754, "balance_loss_mlp": 1.02562034, "epoch": 0.2572673981662408, "flos": 26029669633920.0, "grad_norm": 2.938846801497052, "language_loss": 0.74501789, "learning_rate": 3.381930222507403e-06, "loss": 0.76617998, "num_input_tokens_seen": 92489825, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.59375, "step": 4279, "time_per_iteration": 2.410918951034546 }, { "auxiliary_loss_clip": 0.01081741, "auxiliary_loss_mlp": 0.01041469, "balance_loss_clip": 1.02433884, "balance_loss_mlp": 1.02301311, "epoch": 0.25732752141890874, "flos": 16105712670720.0, "grad_norm": 2.87928010463899, "language_loss": 0.85400975, "learning_rate": 3.3816570745831696e-06, "loss": 0.87524188, "num_input_tokens_seen": 92507270, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 4280, "time_per_iteration": 3.738787889480591 }, { "auxiliary_loss_clip": 0.01083485, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.01500511, "balance_loss_mlp": 1.02460313, "epoch": 0.2573876446715767, "flos": 22523447324160.0, "grad_norm": 2.8416407300649067, "language_loss": 0.78913325, "learning_rate": 3.3813838773511496e-06, "loss": 0.81028682, "num_input_tokens_seen": 92526300, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5859375, "step": 4281, "time_per_iteration": 2.3913938999176025 }, { "auxiliary_loss_clip": 0.01082384, "auxiliary_loss_mlp": 0.01032961, "balance_loss_clip": 1.01530564, "balance_loss_mlp": 1.02549577, "epoch": 0.2574477679242447, "flos": 23949718986240.0, "grad_norm": 1.6785965482775702, "language_loss": 0.87130249, "learning_rate": 3.3811106308210916e-06, "loss": 0.89245594, "num_input_tokens_seen": 92546465, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5703125, "step": 4282, "time_per_iteration": 2.4007761478424072 }, { "auxiliary_loss_clip": 0.01084002, "auxiliary_loss_mlp": 0.01034894, "balance_loss_clip": 1.01863384, "balance_loss_mlp": 1.02449942, "epoch": 0.2575078911769127, "flos": 21980617505280.0, "grad_norm": 1.4833750982305345, "language_loss": 0.7042622, "learning_rate": 3.380837335002748e-06, "loss": 0.72545123, "num_input_tokens_seen": 92567260, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.59375, "step": 4283, "time_per_iteration": 3.8296406269073486 }, { "auxiliary_loss_clip": 0.0108106, "auxiliary_loss_mlp": 0.01032604, "balance_loss_clip": 1.01710677, "balance_loss_mlp": 1.02648377, "epoch": 0.25756801442958066, "flos": 21944307824640.0, "grad_norm": 1.657453372544856, "language_loss": 0.80645716, "learning_rate": 3.380563989905872e-06, "loss": 0.8275938, "num_input_tokens_seen": 92585425, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.546875, "step": 4284, "time_per_iteration": 2.3946008682250977 }, { "auxiliary_loss_clip": 0.01081427, "auxiliary_loss_mlp": 0.01029221, "balance_loss_clip": 1.01440346, "balance_loss_mlp": 1.02537215, "epoch": 0.2576281376822486, "flos": 35260530819840.0, "grad_norm": 2.1585192943874416, "language_loss": 0.69971889, "learning_rate": 3.3802905955402185e-06, "loss": 0.72082543, "num_input_tokens_seen": 92604770, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.55859375, "step": 4285, "time_per_iteration": 3.8747050762176514 }, { "auxiliary_loss_clip": 0.01083203, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.01509285, "balance_loss_mlp": 1.02660656, "epoch": 0.2576882609349166, "flos": 14131549042560.0, "grad_norm": 1.759252741359028, "language_loss": 0.58124995, "learning_rate": 3.3800171519155443e-06, "loss": 0.60238665, "num_input_tokens_seen": 92622635, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5625, "step": 4286, "time_per_iteration": 2.3480701446533203 }, { "auxiliary_loss_clip": 0.01087779, "auxiliary_loss_mlp": 0.01040366, "balance_loss_clip": 1.02334321, "balance_loss_mlp": 1.02740264, "epoch": 0.25774838418758456, "flos": 23257216702080.0, "grad_norm": 2.6005133627988863, "language_loss": 0.64120221, "learning_rate": 3.379743659041607e-06, "loss": 0.66248363, "num_input_tokens_seen": 92642960, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.60546875, "step": 4287, "time_per_iteration": 2.4109575748443604 }, { "auxiliary_loss_clip": 0.01085027, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.01774263, "balance_loss_mlp": 1.02515745, "epoch": 0.2578085074402525, "flos": 22600640073600.0, "grad_norm": 1.7000196163437455, "language_loss": 0.717278, "learning_rate": 3.3794701169281686e-06, "loss": 0.73848414, "num_input_tokens_seen": 92662455, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 4288, "time_per_iteration": 2.376340389251709 }, { "auxiliary_loss_clip": 0.01079433, "auxiliary_loss_mlp": 0.01031555, "balance_loss_clip": 1.01621902, "balance_loss_mlp": 1.02430677, "epoch": 0.2578686306929205, "flos": 24570684161280.0, "grad_norm": 1.4056603383260875, "language_loss": 0.76661074, "learning_rate": 3.37919652558499e-06, "loss": 0.78772056, "num_input_tokens_seen": 92683520, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.55078125, "step": 4289, "time_per_iteration": 2.418536901473999 }, { "auxiliary_loss_clip": 0.0108003, "auxiliary_loss_mlp": 0.01032842, "balance_loss_clip": 1.01683259, "balance_loss_mlp": 1.02402079, "epoch": 0.25792875394558845, "flos": 18112974134400.0, "grad_norm": 62.9597934246925, "language_loss": 0.85113913, "learning_rate": 3.3789228850218347e-06, "loss": 0.87226784, "num_input_tokens_seen": 92701450, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5625, "step": 4290, "time_per_iteration": 3.752499580383301 }, { "auxiliary_loss_clip": 0.01083866, "auxiliary_loss_mlp": 0.01038385, "balance_loss_clip": 1.01965714, "balance_loss_mlp": 1.02597737, "epoch": 0.2579888771982564, "flos": 17711926813440.0, "grad_norm": 1.761805413096687, "language_loss": 0.72238749, "learning_rate": 3.3786491952484686e-06, "loss": 0.74360996, "num_input_tokens_seen": 92720355, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.58203125, "step": 4291, "time_per_iteration": 2.3929483890533447 }, { "auxiliary_loss_clip": 0.0108336, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.01665545, "balance_loss_mlp": 1.02428102, "epoch": 0.2580490004509244, "flos": 16433966073600.0, "grad_norm": 2.5180673896506716, "language_loss": 0.80971766, "learning_rate": 3.378375456274659e-06, "loss": 0.83089674, "num_input_tokens_seen": 92736755, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.58984375, "step": 4292, "time_per_iteration": 2.368997573852539 }, { "auxiliary_loss_clip": 0.01084007, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.02002382, "balance_loss_mlp": 1.02616954, "epoch": 0.25810912370359235, "flos": 33833840221440.0, "grad_norm": 2.157634695913549, "language_loss": 0.67968988, "learning_rate": 3.378101668110175e-06, "loss": 0.70090276, "num_input_tokens_seen": 92757655, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.578125, "step": 4293, "time_per_iteration": 2.503197431564331 }, { "auxiliary_loss_clip": 0.01077826, "auxiliary_loss_mlp": 0.01030119, "balance_loss_clip": 1.01525354, "balance_loss_mlp": 1.02417076, "epoch": 0.2581692469562603, "flos": 25191020931840.0, "grad_norm": 1.8207685713190493, "language_loss": 0.75422269, "learning_rate": 3.377827830764788e-06, "loss": 0.77530217, "num_input_tokens_seen": 92776100, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.53515625, "step": 4294, "time_per_iteration": 2.422917366027832 }, { "auxiliary_loss_clip": 0.01080427, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.01683974, "balance_loss_mlp": 1.0239749, "epoch": 0.2582293702089283, "flos": 34930811139840.0, "grad_norm": 2.302495236749481, "language_loss": 0.80801058, "learning_rate": 3.3775539442482695e-06, "loss": 0.82915831, "num_input_tokens_seen": 92798880, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.56640625, "step": 4295, "time_per_iteration": 2.4947853088378906 }, { "auxiliary_loss_clip": 0.01085873, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.01964092, "balance_loss_mlp": 1.02579057, "epoch": 0.2582894934615963, "flos": 26832532325760.0, "grad_norm": 1.9080965762459037, "language_loss": 0.72517002, "learning_rate": 3.377280008570394e-06, "loss": 0.74640274, "num_input_tokens_seen": 92817750, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 4296, "time_per_iteration": 2.4470784664154053 }, { "auxiliary_loss_clip": 0.01084967, "auxiliary_loss_mlp": 0.01034937, "balance_loss_clip": 1.01725817, "balance_loss_mlp": 1.02626336, "epoch": 0.25834961671426426, "flos": 23514072641280.0, "grad_norm": 2.297984378339985, "language_loss": 0.87064862, "learning_rate": 3.3770060237409382e-06, "loss": 0.89184773, "num_input_tokens_seen": 92837995, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5859375, "step": 4297, "time_per_iteration": 2.4318974018096924 }, { "auxiliary_loss_clip": 0.01083034, "auxiliary_loss_mlp": 0.01041631, "balance_loss_clip": 1.02636003, "balance_loss_mlp": 1.02587152, "epoch": 0.25840973996693223, "flos": 22450059912960.0, "grad_norm": 1.6756887965074723, "language_loss": 0.84725773, "learning_rate": 3.3767319897696795e-06, "loss": 0.86850429, "num_input_tokens_seen": 92857245, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5703125, "step": 4298, "time_per_iteration": 2.3790769577026367 }, { "auxiliary_loss_clip": 0.01082709, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.01518261, "balance_loss_mlp": 1.02583182, "epoch": 0.2584698632196002, "flos": 11290072619520.0, "grad_norm": 2.024330256562373, "language_loss": 0.83507544, "learning_rate": 3.376457906666397e-06, "loss": 0.85621876, "num_input_tokens_seen": 92873265, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5703125, "step": 4299, "time_per_iteration": 2.352186918258667 }, { "auxiliary_loss_clip": 0.01079615, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.01738024, "balance_loss_mlp": 1.02467215, "epoch": 0.25852998647226816, "flos": 17929051758720.0, "grad_norm": 1.9727076145250175, "language_loss": 0.82848322, "learning_rate": 3.3761837744408728e-06, "loss": 0.84959817, "num_input_tokens_seen": 92890880, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.55078125, "step": 4300, "time_per_iteration": 2.355560541152954 }, { "auxiliary_loss_clip": 0.01082258, "auxiliary_loss_mlp": 0.01035253, "balance_loss_clip": 1.01826572, "balance_loss_mlp": 1.0248692, "epoch": 0.2585901097249361, "flos": 33254700721920.0, "grad_norm": 1.7783040865562103, "language_loss": 0.67306131, "learning_rate": 3.375909593102889e-06, "loss": 0.6942364, "num_input_tokens_seen": 92910770, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.57421875, "step": 4301, "time_per_iteration": 2.46988582611084 }, { "auxiliary_loss_clip": 0.01085816, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.01676941, "balance_loss_mlp": 1.02482903, "epoch": 0.2586502329776041, "flos": 18440319841920.0, "grad_norm": 3.446607948280226, "language_loss": 0.80717486, "learning_rate": 3.3756353626622325e-06, "loss": 0.82838249, "num_input_tokens_seen": 92929520, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.609375, "step": 4302, "time_per_iteration": 2.3469271659851074 }, { "auxiliary_loss_clip": 0.01084192, "auxiliary_loss_mlp": 0.01035901, "balance_loss_clip": 1.01944971, "balance_loss_mlp": 1.0264796, "epoch": 0.25871035623027205, "flos": 17967141918720.0, "grad_norm": 1.760854526492328, "language_loss": 0.92042071, "learning_rate": 3.375361083128687e-06, "loss": 0.94162166, "num_input_tokens_seen": 92947890, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.578125, "step": 4303, "time_per_iteration": 2.3587801456451416 }, { "auxiliary_loss_clip": 0.01082902, "auxiliary_loss_mlp": 0.01032423, "balance_loss_clip": 1.01561487, "balance_loss_mlp": 1.02608013, "epoch": 0.25877047948294, "flos": 27776618933760.0, "grad_norm": 1.7360749554289387, "language_loss": 0.67793036, "learning_rate": 3.3750867545120434e-06, "loss": 0.69908363, "num_input_tokens_seen": 92967690, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 4304, "time_per_iteration": 2.4346425533294678 }, { "auxiliary_loss_clip": 0.01084052, "auxiliary_loss_mlp": 0.01042595, "balance_loss_clip": 1.02508342, "balance_loss_mlp": 1.02554274, "epoch": 0.258830602735608, "flos": 27124615693440.0, "grad_norm": 2.5129854023402016, "language_loss": 0.72535753, "learning_rate": 3.3748123768220902e-06, "loss": 0.74662399, "num_input_tokens_seen": 92986830, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5859375, "step": 4305, "time_per_iteration": 2.425720691680908 }, { "auxiliary_loss_clip": 0.01082325, "auxiliary_loss_mlp": 0.01032634, "balance_loss_clip": 1.01633763, "balance_loss_mlp": 1.02396238, "epoch": 0.25889072598827595, "flos": 17890612485120.0, "grad_norm": 1.9504844839412772, "language_loss": 0.75319511, "learning_rate": 3.3745379500686197e-06, "loss": 0.77434468, "num_input_tokens_seen": 93002740, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5859375, "step": 4306, "time_per_iteration": 2.3431077003479004 }, { "auxiliary_loss_clip": 0.01021114, "auxiliary_loss_mlp": 0.01007076, "balance_loss_clip": 1.00535893, "balance_loss_mlp": 1.00742817, "epoch": 0.2589508492409439, "flos": 53932184530560.0, "grad_norm": 0.8516271944646232, "language_loss": 0.57163322, "learning_rate": 3.3742634742614256e-06, "loss": 0.59191501, "num_input_tokens_seen": 93058645, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.13671875, "step": 4307, "time_per_iteration": 2.960516929626465 }, { "auxiliary_loss_clip": 0.01080167, "auxiliary_loss_mlp": 0.0102746, "balance_loss_clip": 1.01196325, "balance_loss_mlp": 1.02441955, "epoch": 0.2590109724936119, "flos": 22124739064320.0, "grad_norm": 1.473653396558904, "language_loss": 0.71911383, "learning_rate": 3.373988949410303e-06, "loss": 0.74019015, "num_input_tokens_seen": 93077140, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.55859375, "step": 4308, "time_per_iteration": 2.3743488788604736 }, { "auxiliary_loss_clip": 0.01083955, "auxiliary_loss_mlp": 0.0103448, "balance_loss_clip": 1.01788592, "balance_loss_mlp": 1.02536333, "epoch": 0.2590710957462799, "flos": 13473610871040.0, "grad_norm": 1.7904500554462124, "language_loss": 0.84118432, "learning_rate": 3.3737143755250488e-06, "loss": 0.8623687, "num_input_tokens_seen": 93093580, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5859375, "step": 4309, "time_per_iteration": 2.355070114135742 }, { "auxiliary_loss_clip": 0.01082653, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.0214262, "balance_loss_mlp": 1.02643061, "epoch": 0.25913121899894787, "flos": 22306077999360.0, "grad_norm": 1.4843665105559463, "language_loss": 0.8458032, "learning_rate": 3.3734397526154626e-06, "loss": 0.86700445, "num_input_tokens_seen": 93112345, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5625, "step": 4310, "time_per_iteration": 2.380340099334717 }, { "auxiliary_loss_clip": 0.0108319, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 1.01346493, "balance_loss_mlp": 1.02490306, "epoch": 0.25919134225161583, "flos": 25810554741120.0, "grad_norm": 1.6865712353990208, "language_loss": 0.7702111, "learning_rate": 3.373165080691344e-06, "loss": 0.79134429, "num_input_tokens_seen": 93131545, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.58203125, "step": 4311, "time_per_iteration": 2.4091074466705322 }, { "auxiliary_loss_clip": 0.01081977, "auxiliary_loss_mlp": 0.01033583, "balance_loss_clip": 1.01733518, "balance_loss_mlp": 1.02419209, "epoch": 0.2592514655042838, "flos": 31210920109440.0, "grad_norm": 1.6445433309728497, "language_loss": 0.72107434, "learning_rate": 3.3728903597624967e-06, "loss": 0.74222994, "num_input_tokens_seen": 93150730, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.578125, "step": 4312, "time_per_iteration": 2.4867470264434814 }, { "auxiliary_loss_clip": 0.01081404, "auxiliary_loss_mlp": 0.01032719, "balance_loss_clip": 1.01629162, "balance_loss_mlp": 1.02468252, "epoch": 0.25931158875695176, "flos": 18474115904640.0, "grad_norm": 1.6707254952935937, "language_loss": 0.69457316, "learning_rate": 3.372615589838724e-06, "loss": 0.7157144, "num_input_tokens_seen": 93167895, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.56640625, "step": 4313, "time_per_iteration": 2.34779953956604 }, { "auxiliary_loss_clip": 0.01080702, "auxiliary_loss_mlp": 0.01033108, "balance_loss_clip": 1.01840973, "balance_loss_mlp": 1.02468693, "epoch": 0.2593717120096197, "flos": 19206942675840.0, "grad_norm": 1.5203121298846591, "language_loss": 0.80340791, "learning_rate": 3.3723407709298314e-06, "loss": 0.8245461, "num_input_tokens_seen": 93187650, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.55859375, "step": 4314, "time_per_iteration": 2.3789196014404297 }, { "auxiliary_loss_clip": 0.01084503, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.0201509, "balance_loss_mlp": 1.0246979, "epoch": 0.2594318352622877, "flos": 31246775942400.0, "grad_norm": 2.262450722488703, "language_loss": 0.67424631, "learning_rate": 3.3720659030456262e-06, "loss": 0.69546759, "num_input_tokens_seen": 93207370, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.59765625, "step": 4315, "time_per_iteration": 2.4545397758483887 }, { "auxiliary_loss_clip": 0.0107976, "auxiliary_loss_mlp": 0.01031527, "balance_loss_clip": 1.01591063, "balance_loss_mlp": 1.02369475, "epoch": 0.25949195851495566, "flos": 22236042078720.0, "grad_norm": 1.4924082349050867, "language_loss": 0.79146779, "learning_rate": 3.371790986195919e-06, "loss": 0.8125807, "num_input_tokens_seen": 93227925, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5625, "step": 4316, "time_per_iteration": 2.390685558319092 }, { "auxiliary_loss_clip": 0.01081859, "auxiliary_loss_mlp": 0.01035054, "balance_loss_clip": 1.01837635, "balance_loss_mlp": 1.02379608, "epoch": 0.2595520817676236, "flos": 28074427764480.0, "grad_norm": 1.4888116581640165, "language_loss": 0.77716893, "learning_rate": 3.37151602039052e-06, "loss": 0.79833806, "num_input_tokens_seen": 93250020, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.578125, "step": 4317, "time_per_iteration": 2.4294650554656982 }, { "auxiliary_loss_clip": 0.01084033, "auxiliary_loss_mlp": 0.01044569, "balance_loss_clip": 1.02638912, "balance_loss_mlp": 1.02693534, "epoch": 0.2596122050202916, "flos": 20189992227840.0, "grad_norm": 1.9285170049777474, "language_loss": 0.78200823, "learning_rate": 3.3712410056392418e-06, "loss": 0.8032943, "num_input_tokens_seen": 93269070, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.5703125, "step": 4318, "time_per_iteration": 2.3697478771209717 }, { "auxiliary_loss_clip": 0.01078955, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.01239765, "balance_loss_mlp": 1.02347076, "epoch": 0.25967232827295955, "flos": 22526868637440.0, "grad_norm": 1.673626722566245, "language_loss": 0.76285136, "learning_rate": 3.3709659419518994e-06, "loss": 0.78393841, "num_input_tokens_seen": 93290250, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5546875, "step": 4319, "time_per_iteration": 2.406689405441284 }, { "auxiliary_loss_clip": 0.01077989, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.01764274, "balance_loss_mlp": 1.0242908, "epoch": 0.2597324515256275, "flos": 21067219848960.0, "grad_norm": 1.548263438035279, "language_loss": 0.76447415, "learning_rate": 3.3706908293383095e-06, "loss": 0.78559065, "num_input_tokens_seen": 93310090, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53515625, "step": 4320, "time_per_iteration": 3.782745838165283 }, { "auxiliary_loss_clip": 0.0108315, "auxiliary_loss_mlp": 0.01033877, "balance_loss_clip": 1.01718819, "balance_loss_mlp": 1.02675653, "epoch": 0.2597925747782955, "flos": 22049047503360.0, "grad_norm": 1.5114599799288724, "language_loss": 0.71181488, "learning_rate": 3.37041566780829e-06, "loss": 0.73298526, "num_input_tokens_seen": 93329570, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5625, "step": 4321, "time_per_iteration": 2.4091897010803223 }, { "auxiliary_loss_clip": 0.01085197, "auxiliary_loss_mlp": 0.01037141, "balance_loss_clip": 1.02032089, "balance_loss_mlp": 1.02483368, "epoch": 0.2598526980309635, "flos": 19535929217280.0, "grad_norm": 1.8954433697780415, "language_loss": 0.74305975, "learning_rate": 3.3701404573716597e-06, "loss": 0.76428312, "num_input_tokens_seen": 93347920, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.6015625, "step": 4322, "time_per_iteration": 2.3573434352874756 }, { "auxiliary_loss_clip": 0.0108302, "auxiliary_loss_mlp": 0.01035727, "balance_loss_clip": 1.01884699, "balance_loss_mlp": 1.02527928, "epoch": 0.25991282128363147, "flos": 24494154727680.0, "grad_norm": 2.23228778176635, "language_loss": 0.74200404, "learning_rate": 3.3698651980382417e-06, "loss": 0.76319158, "num_input_tokens_seen": 93367145, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.578125, "step": 4323, "time_per_iteration": 3.823972225189209 }, { "auxiliary_loss_clip": 0.0108677, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.02110124, "balance_loss_mlp": 1.02496362, "epoch": 0.25997294453629943, "flos": 24200465437440.0, "grad_norm": 2.0590544925981082, "language_loss": 0.66615325, "learning_rate": 3.3695898898178573e-06, "loss": 0.68741906, "num_input_tokens_seen": 93386555, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.6171875, "step": 4324, "time_per_iteration": 2.414278030395508 }, { "auxiliary_loss_clip": 0.01081461, "auxiliary_loss_mlp": 0.01033286, "balance_loss_clip": 1.01802182, "balance_loss_mlp": 1.02527416, "epoch": 0.2600330677889674, "flos": 31430104824960.0, "grad_norm": 1.9668820867681012, "language_loss": 0.71053094, "learning_rate": 3.3693145327203336e-06, "loss": 0.73167843, "num_input_tokens_seen": 93405590, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5625, "step": 4325, "time_per_iteration": 3.812192440032959 }, { "auxiliary_loss_clip": 0.01080737, "auxiliary_loss_mlp": 0.0103104, "balance_loss_clip": 1.01441014, "balance_loss_mlp": 1.023157, "epoch": 0.26009319104163536, "flos": 32265262391040.0, "grad_norm": 1.7332947526171827, "language_loss": 0.72819197, "learning_rate": 3.3690391267554963e-06, "loss": 0.74930972, "num_input_tokens_seen": 93424750, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.57421875, "step": 4326, "time_per_iteration": 2.4504213333129883 }, { "auxiliary_loss_clip": 0.01078319, "auxiliary_loss_mlp": 0.01032779, "balance_loss_clip": 1.01828325, "balance_loss_mlp": 1.02417922, "epoch": 0.26015331429430333, "flos": 26285548055040.0, "grad_norm": 1.7490654353053325, "language_loss": 0.8679921, "learning_rate": 3.3687636719331744e-06, "loss": 0.88910306, "num_input_tokens_seen": 93443465, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5390625, "step": 4327, "time_per_iteration": 2.418240547180176 }, { "auxiliary_loss_clip": 0.01085405, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.02375913, "balance_loss_mlp": 1.02679777, "epoch": 0.2602134375469713, "flos": 21141270576000.0, "grad_norm": 1.4251562769475314, "language_loss": 0.801377, "learning_rate": 3.368488168263198e-06, "loss": 0.82264721, "num_input_tokens_seen": 93462580, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5859375, "step": 4328, "time_per_iteration": 2.38944673538208 }, { "auxiliary_loss_clip": 0.01079567, "auxiliary_loss_mlp": 0.01036565, "balance_loss_clip": 1.02085328, "balance_loss_mlp": 1.02369261, "epoch": 0.26027356079963926, "flos": 25920147098880.0, "grad_norm": 1.4861544983352568, "language_loss": 0.87905395, "learning_rate": 3.3682126157553983e-06, "loss": 0.90021527, "num_input_tokens_seen": 93482790, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.55859375, "step": 4329, "time_per_iteration": 3.8182191848754883 }, { "auxiliary_loss_clip": 0.01079179, "auxiliary_loss_mlp": 0.01032644, "balance_loss_clip": 1.0176115, "balance_loss_mlp": 1.02456927, "epoch": 0.2603336840523072, "flos": 26358027770880.0, "grad_norm": 1.9163600045187557, "language_loss": 0.77740896, "learning_rate": 3.3679370144196106e-06, "loss": 0.79852718, "num_input_tokens_seen": 93498795, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.546875, "step": 4330, "time_per_iteration": 2.404296636581421 }, { "auxiliary_loss_clip": 0.0108491, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.01608062, "balance_loss_mlp": 1.02601981, "epoch": 0.2603938073049752, "flos": 23512536541440.0, "grad_norm": 1.521642229427162, "language_loss": 0.75395083, "learning_rate": 3.367661364265669e-06, "loss": 0.77512348, "num_input_tokens_seen": 93518335, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5859375, "step": 4331, "time_per_iteration": 2.4064319133758545 }, { "auxiliary_loss_clip": 0.01079989, "auxiliary_loss_mlp": 0.01028631, "balance_loss_clip": 1.01396835, "balance_loss_mlp": 1.02551913, "epoch": 0.26045393055764315, "flos": 25373127916800.0, "grad_norm": 1.3703309693452361, "language_loss": 0.690584, "learning_rate": 3.367385665303412e-06, "loss": 0.71167016, "num_input_tokens_seen": 93539170, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.546875, "step": 4332, "time_per_iteration": 2.4051527976989746 }, { "auxiliary_loss_clip": 0.01082459, "auxiliary_loss_mlp": 0.01035422, "balance_loss_clip": 1.01947129, "balance_loss_mlp": 1.02486897, "epoch": 0.2605140538103111, "flos": 27634068385920.0, "grad_norm": 1.9334433825288309, "language_loss": 0.79419458, "learning_rate": 3.3671099175426773e-06, "loss": 0.81537342, "num_input_tokens_seen": 93558480, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.57421875, "step": 4333, "time_per_iteration": 2.426959753036499 }, { "auxiliary_loss_clip": 0.01080782, "auxiliary_loss_mlp": 0.01034208, "balance_loss_clip": 1.01866937, "balance_loss_mlp": 1.02513885, "epoch": 0.2605741770629791, "flos": 13769045729280.0, "grad_norm": 1.8848828748133224, "language_loss": 0.80427253, "learning_rate": 3.366834120993307e-06, "loss": 0.82542241, "num_input_tokens_seen": 93575220, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.55859375, "step": 4334, "time_per_iteration": 2.33345627784729 }, { "auxiliary_loss_clip": 0.01081458, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.01612377, "balance_loss_mlp": 1.02395833, "epoch": 0.26063430031564705, "flos": 26030472595200.0, "grad_norm": 1.868475617096109, "language_loss": 0.79615092, "learning_rate": 3.3665582756651424e-06, "loss": 0.817294, "num_input_tokens_seen": 93597015, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.57421875, "step": 4335, "time_per_iteration": 2.4325897693634033 }, { "auxiliary_loss_clip": 0.01018495, "auxiliary_loss_mlp": 0.01000548, "balance_loss_clip": 0.99874836, "balance_loss_mlp": 1.00523376, "epoch": 0.26069442356831507, "flos": 62440587619200.0, "grad_norm": 0.854688124413243, "language_loss": 0.60818154, "learning_rate": 3.366282381568028e-06, "loss": 0.62837195, "num_input_tokens_seen": 93657775, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.1328125, "step": 4336, "time_per_iteration": 3.009134531021118 }, { "auxiliary_loss_clip": 0.01081802, "auxiliary_loss_mlp": 0.01038724, "balance_loss_clip": 1.0222609, "balance_loss_mlp": 1.02470827, "epoch": 0.26075454682098304, "flos": 13625517663360.0, "grad_norm": 1.9892896970213614, "language_loss": 0.76825356, "learning_rate": 3.3660064387118104e-06, "loss": 0.78945875, "num_input_tokens_seen": 93676145, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5703125, "step": 4337, "time_per_iteration": 2.3651371002197266 }, { "auxiliary_loss_clip": 0.01084815, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.01597285, "balance_loss_mlp": 1.02653301, "epoch": 0.260814670073651, "flos": 12125823678720.0, "grad_norm": 2.0704255675615615, "language_loss": 0.74591124, "learning_rate": 3.3657304471063363e-06, "loss": 0.76708972, "num_input_tokens_seen": 93692480, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5859375, "step": 4338, "time_per_iteration": 2.338366746902466 }, { "auxiliary_loss_clip": 0.01088111, "auxiliary_loss_mlp": 0.01037521, "balance_loss_clip": 1.0206883, "balance_loss_mlp": 1.02748883, "epoch": 0.26087479332631897, "flos": 15121615777920.0, "grad_norm": 4.316681840861605, "language_loss": 0.80428994, "learning_rate": 3.3654544067614557e-06, "loss": 0.82554621, "num_input_tokens_seen": 93710165, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.60546875, "step": 4339, "time_per_iteration": 2.3600778579711914 }, { "auxiliary_loss_clip": 0.010831, "auxiliary_loss_mlp": 0.01037218, "balance_loss_clip": 1.02179277, "balance_loss_mlp": 1.02638113, "epoch": 0.26093491657898693, "flos": 24679787760000.0, "grad_norm": 1.8164692196083743, "language_loss": 0.76641595, "learning_rate": 3.36517831768702e-06, "loss": 0.78761917, "num_input_tokens_seen": 93730185, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.56640625, "step": 4340, "time_per_iteration": 2.4018912315368652 }, { "auxiliary_loss_clip": 0.0108625, "auxiliary_loss_mlp": 0.01036358, "balance_loss_clip": 1.02020526, "balance_loss_mlp": 1.02616417, "epoch": 0.2609950398316549, "flos": 25115050080000.0, "grad_norm": 1.4591585974620058, "language_loss": 0.82838297, "learning_rate": 3.3649021798928813e-06, "loss": 0.84960902, "num_input_tokens_seen": 93747690, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.6015625, "step": 4341, "time_per_iteration": 2.3946807384490967 }, { "auxiliary_loss_clip": 0.01083036, "auxiliary_loss_mlp": 0.01037099, "balance_loss_clip": 1.01970696, "balance_loss_mlp": 1.0253309, "epoch": 0.26105516308432286, "flos": 28547326396800.0, "grad_norm": 3.6554276347206565, "language_loss": 0.76271361, "learning_rate": 3.364625993388895e-06, "loss": 0.78391492, "num_input_tokens_seen": 93767405, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.578125, "step": 4342, "time_per_iteration": 2.430886745452881 }, { "auxiliary_loss_clip": 0.01081981, "auxiliary_loss_mlp": 0.01029674, "balance_loss_clip": 1.01296103, "balance_loss_mlp": 1.02477884, "epoch": 0.2611152863369908, "flos": 39529046954880.0, "grad_norm": 1.7236296549883814, "language_loss": 0.66337711, "learning_rate": 3.364349758184917e-06, "loss": 0.68449366, "num_input_tokens_seen": 93789950, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 4343, "time_per_iteration": 2.5204410552978516 }, { "auxiliary_loss_clip": 0.01084426, "auxiliary_loss_mlp": 0.01036005, "balance_loss_clip": 1.01945925, "balance_loss_mlp": 1.02507138, "epoch": 0.2611754095896588, "flos": 13734481616640.0, "grad_norm": 1.7901669156582372, "language_loss": 0.73423326, "learning_rate": 3.3640734742908066e-06, "loss": 0.75543761, "num_input_tokens_seen": 93807835, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59375, "step": 4344, "time_per_iteration": 2.355929374694824 }, { "auxiliary_loss_clip": 0.01084147, "auxiliary_loss_mlp": 0.0103658, "balance_loss_clip": 1.0190804, "balance_loss_mlp": 1.02618527, "epoch": 0.26123553284232676, "flos": 21505589280000.0, "grad_norm": 2.259646251158903, "language_loss": 0.86677957, "learning_rate": 3.3637971417164213e-06, "loss": 0.88798684, "num_input_tokens_seen": 93825670, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.578125, "step": 4345, "time_per_iteration": 2.3687942028045654 }, { "auxiliary_loss_clip": 0.01081716, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.02000141, "balance_loss_mlp": 1.02591062, "epoch": 0.2612956560949947, "flos": 21138791869440.0, "grad_norm": 1.8617342643171846, "language_loss": 0.76585996, "learning_rate": 3.3635207604716254e-06, "loss": 0.78703523, "num_input_tokens_seen": 93844045, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55859375, "step": 4346, "time_per_iteration": 2.3935353755950928 }, { "auxiliary_loss_clip": 0.01081684, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.02169895, "balance_loss_mlp": 1.02359319, "epoch": 0.2613557793476627, "flos": 25117842988800.0, "grad_norm": 1.5974130175040966, "language_loss": 0.75614232, "learning_rate": 3.36324433056628e-06, "loss": 0.77735019, "num_input_tokens_seen": 93864380, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.58203125, "step": 4347, "time_per_iteration": 2.405407190322876 }, { "auxiliary_loss_clip": 0.01081437, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.01563072, "balance_loss_mlp": 1.02452922, "epoch": 0.26141590260033065, "flos": 26066502984960.0, "grad_norm": 2.738091999126562, "language_loss": 0.73471534, "learning_rate": 3.3629678520102517e-06, "loss": 0.75585037, "num_input_tokens_seen": 93885475, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4348, "time_per_iteration": 2.451572895050049 }, { "auxiliary_loss_clip": 0.01084936, "auxiliary_loss_mlp": 0.01037315, "balance_loss_clip": 1.02075648, "balance_loss_mlp": 1.02558911, "epoch": 0.2614760258529987, "flos": 25700368890240.0, "grad_norm": 1.7207225096228225, "language_loss": 0.90501082, "learning_rate": 3.3626913248134065e-06, "loss": 0.92623335, "num_input_tokens_seen": 93905545, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59375, "step": 4349, "time_per_iteration": 2.4037556648254395 }, { "auxiliary_loss_clip": 0.01081057, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.01292086, "balance_loss_mlp": 1.02502859, "epoch": 0.26153614910566664, "flos": 17456188037760.0, "grad_norm": 1.6986535503867581, "language_loss": 0.80059385, "learning_rate": 3.3624147489856134e-06, "loss": 0.82169342, "num_input_tokens_seen": 93924185, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55859375, "step": 4350, "time_per_iteration": 2.3875317573547363 }, { "auxiliary_loss_clip": 0.0108049, "auxiliary_loss_mlp": 0.01033093, "balance_loss_clip": 1.01740551, "balance_loss_mlp": 1.02485681, "epoch": 0.2615962723583346, "flos": 17711856990720.0, "grad_norm": 1.8550050424244182, "language_loss": 0.62284708, "learning_rate": 3.3621381245367425e-06, "loss": 0.64398295, "num_input_tokens_seen": 93942825, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5546875, "step": 4351, "time_per_iteration": 2.353825092315674 }, { "auxiliary_loss_clip": 0.0108404, "auxiliary_loss_mlp": 0.01032888, "balance_loss_clip": 1.01572168, "balance_loss_mlp": 1.02454305, "epoch": 0.26165639561100257, "flos": 23256623208960.0, "grad_norm": 1.7505439297140573, "language_loss": 0.83453608, "learning_rate": 3.361861451476665e-06, "loss": 0.85570538, "num_input_tokens_seen": 93962045, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.59375, "step": 4352, "time_per_iteration": 2.4056544303894043 }, { "auxiliary_loss_clip": 0.01019163, "auxiliary_loss_mlp": 0.01005607, "balance_loss_clip": 1.00368738, "balance_loss_mlp": 1.00525677, "epoch": 0.26171651886367053, "flos": 66734940026880.0, "grad_norm": 0.7962985566204221, "language_loss": 0.705495, "learning_rate": 3.361584729815256e-06, "loss": 0.7257427, "num_input_tokens_seen": 94021175, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.13867188, "step": 4353, "time_per_iteration": 2.9259660243988037 }, { "auxiliary_loss_clip": 0.01081611, "auxiliary_loss_mlp": 0.01038839, "balance_loss_clip": 1.02209008, "balance_loss_mlp": 1.02363181, "epoch": 0.2617766421163385, "flos": 22348392433920.0, "grad_norm": 1.7339058908910299, "language_loss": 0.77563334, "learning_rate": 3.36130795956239e-06, "loss": 0.79683781, "num_input_tokens_seen": 94043370, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.58203125, "step": 4354, "time_per_iteration": 2.4646551609039307 }, { "auxiliary_loss_clip": 0.01086978, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.01889896, "balance_loss_mlp": 1.02577257, "epoch": 0.26183676536900646, "flos": 26065944403200.0, "grad_norm": 1.9438712587568305, "language_loss": 0.6831277, "learning_rate": 3.3610311407279456e-06, "loss": 0.70435178, "num_input_tokens_seen": 94063510, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.609375, "step": 4355, "time_per_iteration": 2.4147260189056396 }, { "auxiliary_loss_clip": 0.01082708, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.01249969, "balance_loss_mlp": 1.0246644, "epoch": 0.26189688862167443, "flos": 20995403448960.0, "grad_norm": 1.7696500051215371, "language_loss": 0.67444134, "learning_rate": 3.3607542733218002e-06, "loss": 0.69557124, "num_input_tokens_seen": 94083865, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.578125, "step": 4356, "time_per_iteration": 2.4343392848968506 }, { "auxiliary_loss_clip": 0.01017185, "auxiliary_loss_mlp": 0.01002721, "balance_loss_clip": 1.00067043, "balance_loss_mlp": 1.00336766, "epoch": 0.2619570118743424, "flos": 65795007870720.0, "grad_norm": 0.6852037391226983, "language_loss": 0.53176242, "learning_rate": 3.360477357353835e-06, "loss": 0.55196142, "num_input_tokens_seen": 94144095, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.13867188, "step": 4357, "time_per_iteration": 2.9522459506988525 }, { "auxiliary_loss_clip": 0.01085013, "auxiliary_loss_mlp": 0.01035973, "balance_loss_clip": 1.0194509, "balance_loss_mlp": 1.02569139, "epoch": 0.26201713512701036, "flos": 28765568505600.0, "grad_norm": 1.9117433597624476, "language_loss": 0.83305454, "learning_rate": 3.3602003928339325e-06, "loss": 0.85426438, "num_input_tokens_seen": 94163035, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.59375, "step": 4358, "time_per_iteration": 2.458768367767334 }, { "auxiliary_loss_clip": 0.01086796, "auxiliary_loss_mlp": 0.01036543, "balance_loss_clip": 1.01835179, "balance_loss_mlp": 1.02564311, "epoch": 0.2620772583796783, "flos": 26431310448000.0, "grad_norm": 2.0582838763570064, "language_loss": 0.67504764, "learning_rate": 3.359923379771977e-06, "loss": 0.69628096, "num_input_tokens_seen": 94182520, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.609375, "step": 4359, "time_per_iteration": 3.8571369647979736 }, { "auxiliary_loss_clip": 0.0108345, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.0154438, "balance_loss_mlp": 1.02413988, "epoch": 0.2621373816323463, "flos": 20155532849280.0, "grad_norm": 2.1522994421529535, "language_loss": 0.78343588, "learning_rate": 3.359646318177854e-06, "loss": 0.80459011, "num_input_tokens_seen": 94201795, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.59375, "step": 4360, "time_per_iteration": 2.396752119064331 }, { "auxiliary_loss_clip": 0.01083539, "auxiliary_loss_mlp": 0.01034785, "balance_loss_clip": 1.0194782, "balance_loss_mlp": 1.02581263, "epoch": 0.26219750488501425, "flos": 28619980669440.0, "grad_norm": 1.685938352541504, "language_loss": 0.67987466, "learning_rate": 3.3593692080614515e-06, "loss": 0.70105791, "num_input_tokens_seen": 94222390, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.578125, "step": 4361, "time_per_iteration": 2.4369585514068604 }, { "auxiliary_loss_clip": 0.01084144, "auxiliary_loss_mlp": 0.01041632, "balance_loss_clip": 1.02305889, "balance_loss_mlp": 1.02448654, "epoch": 0.2622576281376823, "flos": 15041839587840.0, "grad_norm": 1.7258041002874784, "language_loss": 0.84457237, "learning_rate": 3.3590920494326585e-06, "loss": 0.86583012, "num_input_tokens_seen": 94239980, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.59765625, "step": 4362, "time_per_iteration": 3.718644618988037 }, { "auxiliary_loss_clip": 0.01083127, "auxiliary_loss_mlp": 0.01040111, "balance_loss_clip": 1.02224183, "balance_loss_mlp": 1.02676213, "epoch": 0.26231775139035024, "flos": 26394965856000.0, "grad_norm": 3.6317635192113453, "language_loss": 0.65254724, "learning_rate": 3.3588148423013665e-06, "loss": 0.67377967, "num_input_tokens_seen": 94260715, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5625, "step": 4363, "time_per_iteration": 2.4228336811065674 }, { "auxiliary_loss_clip": 0.0101675, "auxiliary_loss_mlp": 0.01002206, "balance_loss_clip": 1.00038254, "balance_loss_mlp": 1.00312877, "epoch": 0.2623778746430182, "flos": 65405341653120.0, "grad_norm": 0.885519350546974, "language_loss": 0.61138755, "learning_rate": 3.3585375866774683e-06, "loss": 0.63157707, "num_input_tokens_seen": 94321285, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.13671875, "step": 4364, "time_per_iteration": 3.109790325164795 }, { "auxiliary_loss_clip": 0.0108592, "auxiliary_loss_mlp": 0.01039824, "balance_loss_clip": 1.0216918, "balance_loss_mlp": 1.02690482, "epoch": 0.26243799789568617, "flos": 12603400433280.0, "grad_norm": 2.486995179763905, "language_loss": 0.71691263, "learning_rate": 3.3582602825708577e-06, "loss": 0.73817003, "num_input_tokens_seen": 94335420, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.58984375, "step": 4365, "time_per_iteration": 3.7059733867645264 }, { "auxiliary_loss_clip": 0.01084926, "auxiliary_loss_mlp": 0.01034484, "balance_loss_clip": 1.01810503, "balance_loss_mlp": 1.02658844, "epoch": 0.26249812114835414, "flos": 28622494287360.0, "grad_norm": 1.5865053590280984, "language_loss": 0.77013832, "learning_rate": 3.3579829299914314e-06, "loss": 0.79133248, "num_input_tokens_seen": 94357440, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5859375, "step": 4366, "time_per_iteration": 2.4810895919799805 }, { "auxiliary_loss_clip": 0.01086018, "auxiliary_loss_mlp": 0.01037542, "balance_loss_clip": 1.02080488, "balance_loss_mlp": 1.0271523, "epoch": 0.2625582444010221, "flos": 14464515479040.0, "grad_norm": 2.1766757845115405, "language_loss": 0.75699329, "learning_rate": 3.3577055289490875e-06, "loss": 0.77822882, "num_input_tokens_seen": 94375690, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.58984375, "step": 4367, "time_per_iteration": 2.3700788021087646 }, { "auxiliary_loss_clip": 0.01080943, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.01578438, "balance_loss_mlp": 1.02506244, "epoch": 0.26261836765369007, "flos": 16612372454400.0, "grad_norm": 1.5430765527569217, "language_loss": 0.6939097, "learning_rate": 3.357428079453726e-06, "loss": 0.71502817, "num_input_tokens_seen": 94393190, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.55859375, "step": 4368, "time_per_iteration": 2.3812553882598877 }, { "auxiliary_loss_clip": 0.01081122, "auxiliary_loss_mlp": 0.01036805, "balance_loss_clip": 1.02010322, "balance_loss_mlp": 1.02419424, "epoch": 0.26267849090635803, "flos": 20518943857920.0, "grad_norm": 1.9212666150589235, "language_loss": 0.78736794, "learning_rate": 3.357150581515248e-06, "loss": 0.8085472, "num_input_tokens_seen": 94410975, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5703125, "step": 4369, "time_per_iteration": 3.818350076675415 }, { "auxiliary_loss_clip": 0.01082732, "auxiliary_loss_mlp": 0.01034902, "balance_loss_clip": 1.01781893, "balance_loss_mlp": 1.02509856, "epoch": 0.262738614159026, "flos": 21322888801920.0, "grad_norm": 1.8764586658406883, "language_loss": 0.83259284, "learning_rate": 3.3568730351435565e-06, "loss": 0.85376918, "num_input_tokens_seen": 94429985, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.578125, "step": 4370, "time_per_iteration": 2.4190120697021484 }, { "auxiliary_loss_clip": 0.01086615, "auxiliary_loss_mlp": 0.01042301, "balance_loss_clip": 1.02450299, "balance_loss_mlp": 1.02636886, "epoch": 0.26279873741169396, "flos": 17602613746560.0, "grad_norm": 1.9960765006733125, "language_loss": 0.71356869, "learning_rate": 3.356595440348557e-06, "loss": 0.73485786, "num_input_tokens_seen": 94448660, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.6015625, "step": 4371, "time_per_iteration": 2.3624267578125 }, { "auxiliary_loss_clip": 0.01016725, "auxiliary_loss_mlp": 0.01008475, "balance_loss_clip": 1.00663948, "balance_loss_mlp": 1.00319839, "epoch": 0.2628588606643619, "flos": 60946514363520.0, "grad_norm": 0.6903965674816097, "language_loss": 0.56374061, "learning_rate": 3.356317797140156e-06, "loss": 0.5839926, "num_input_tokens_seen": 94515630, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.13476562, "step": 4372, "time_per_iteration": 3.1685798168182373 }, { "auxiliary_loss_clip": 0.01078309, "auxiliary_loss_mlp": 0.01032138, "balance_loss_clip": 1.01613426, "balance_loss_mlp": 1.02404928, "epoch": 0.2629189839170299, "flos": 27015093158400.0, "grad_norm": 1.568344460884402, "language_loss": 0.77413881, "learning_rate": 3.3560401055282617e-06, "loss": 0.79524326, "num_input_tokens_seen": 94535385, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 4373, "time_per_iteration": 2.413288116455078 }, { "auxiliary_loss_clip": 0.01083175, "auxiliary_loss_mlp": 0.01033638, "balance_loss_clip": 1.01822376, "balance_loss_mlp": 1.02676439, "epoch": 0.26297910716969786, "flos": 17018900858880.0, "grad_norm": 2.2738660561378423, "language_loss": 0.71958148, "learning_rate": 3.3557623655227835e-06, "loss": 0.7407496, "num_input_tokens_seen": 94552650, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5625, "step": 4374, "time_per_iteration": 2.3885531425476074 }, { "auxiliary_loss_clip": 0.01083907, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.01941419, "balance_loss_mlp": 1.02640259, "epoch": 0.2630392304223659, "flos": 24896284300800.0, "grad_norm": 1.9842920010001697, "language_loss": 0.80709791, "learning_rate": 3.355484577133634e-06, "loss": 0.82831097, "num_input_tokens_seen": 94574075, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.57421875, "step": 4375, "time_per_iteration": 2.4129323959350586 }, { "auxiliary_loss_clip": 0.01080039, "auxiliary_loss_mlp": 0.01030826, "balance_loss_clip": 1.0147804, "balance_loss_mlp": 1.02443516, "epoch": 0.26309935367503384, "flos": 32852640971520.0, "grad_norm": 1.8311917808216, "language_loss": 0.66401243, "learning_rate": 3.3552067403707272e-06, "loss": 0.68512112, "num_input_tokens_seen": 94594255, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5546875, "step": 4376, "time_per_iteration": 2.492330551147461 }, { "auxiliary_loss_clip": 0.01082875, "auxiliary_loss_mlp": 0.01034176, "balance_loss_clip": 1.01810658, "balance_loss_mlp": 1.0257802, "epoch": 0.2631594769277018, "flos": 15887051625600.0, "grad_norm": 2.1582891324182305, "language_loss": 0.69157761, "learning_rate": 3.3549288552439777e-06, "loss": 0.71274817, "num_input_tokens_seen": 94611410, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5703125, "step": 4377, "time_per_iteration": 2.3599071502685547 }, { "auxiliary_loss_clip": 0.01082518, "auxiliary_loss_mlp": 0.01030591, "balance_loss_clip": 1.01411593, "balance_loss_mlp": 1.02396512, "epoch": 0.2632196001803698, "flos": 50803060348800.0, "grad_norm": 1.7938992099265185, "language_loss": 0.79095978, "learning_rate": 3.3546509217633025e-06, "loss": 0.81209087, "num_input_tokens_seen": 94636575, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5859375, "step": 4378, "time_per_iteration": 2.6629154682159424 }, { "auxiliary_loss_clip": 0.01081721, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.02221322, "balance_loss_mlp": 1.02553117, "epoch": 0.26327972343303774, "flos": 13732247289600.0, "grad_norm": 2.2174823951933673, "language_loss": 0.76924193, "learning_rate": 3.3543729399386207e-06, "loss": 0.790429, "num_input_tokens_seen": 94654345, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5625, "step": 4379, "time_per_iteration": 2.3615598678588867 }, { "auxiliary_loss_clip": 0.01084724, "auxiliary_loss_mlp": 0.01038309, "balance_loss_clip": 1.01914012, "balance_loss_mlp": 1.02540565, "epoch": 0.2633398466857057, "flos": 23767926203520.0, "grad_norm": 6.362286528594949, "language_loss": 0.77420974, "learning_rate": 3.354094909779852e-06, "loss": 0.79544008, "num_input_tokens_seen": 94673985, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.59375, "step": 4380, "time_per_iteration": 2.4259772300720215 }, { "auxiliary_loss_clip": 0.01083306, "auxiliary_loss_mlp": 0.01031373, "balance_loss_clip": 1.0150708, "balance_loss_mlp": 1.02437663, "epoch": 0.26339996993837367, "flos": 27598980602880.0, "grad_norm": 1.725418056129015, "language_loss": 0.63635713, "learning_rate": 3.353816831296919e-06, "loss": 0.6575039, "num_input_tokens_seen": 94693145, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.58984375, "step": 4381, "time_per_iteration": 2.430692434310913 }, { "auxiliary_loss_clip": 0.01081333, "auxiliary_loss_mlp": 0.01030871, "balance_loss_clip": 1.01564837, "balance_loss_mlp": 1.0243516, "epoch": 0.26346009319104163, "flos": 16945373802240.0, "grad_norm": 1.7225451930022155, "language_loss": 0.82855475, "learning_rate": 3.353538704499747e-06, "loss": 0.84967685, "num_input_tokens_seen": 94710185, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5703125, "step": 4382, "time_per_iteration": 2.3888862133026123 }, { "auxiliary_loss_clip": 0.01088813, "auxiliary_loss_mlp": 0.01039227, "balance_loss_clip": 1.02026033, "balance_loss_mlp": 1.02641439, "epoch": 0.2635202164437096, "flos": 37230714552960.0, "grad_norm": 1.8838636120225258, "language_loss": 0.70196539, "learning_rate": 3.3532605293982592e-06, "loss": 0.72324574, "num_input_tokens_seen": 94730280, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.625, "step": 4383, "time_per_iteration": 2.5154879093170166 }, { "auxiliary_loss_clip": 0.0108451, "auxiliary_loss_mlp": 0.01033798, "balance_loss_clip": 1.01813376, "balance_loss_mlp": 1.02588332, "epoch": 0.26358033969637756, "flos": 20995298714880.0, "grad_norm": 1.6678512838406812, "language_loss": 0.69217885, "learning_rate": 3.3529823060023847e-06, "loss": 0.71336192, "num_input_tokens_seen": 94748560, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5859375, "step": 4384, "time_per_iteration": 2.4252703189849854 }, { "auxiliary_loss_clip": 0.01081172, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.01836967, "balance_loss_mlp": 1.02483261, "epoch": 0.26364046294904553, "flos": 27744847729920.0, "grad_norm": 1.9567158780207452, "language_loss": 0.70250678, "learning_rate": 3.352704034322052e-06, "loss": 0.72366405, "num_input_tokens_seen": 94767570, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5625, "step": 4385, "time_per_iteration": 2.419276237487793 }, { "auxiliary_loss_clip": 0.01085956, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.0178926, "balance_loss_mlp": 1.028512, "epoch": 0.2637005862017135, "flos": 22891990302720.0, "grad_norm": 2.8594680097210334, "language_loss": 0.85318404, "learning_rate": 3.352425714367191e-06, "loss": 0.87438822, "num_input_tokens_seen": 94784985, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.57421875, "step": 4386, "time_per_iteration": 2.4315130710601807 }, { "auxiliary_loss_clip": 0.01085615, "auxiliary_loss_mlp": 0.01042264, "balance_loss_clip": 1.02513361, "balance_loss_mlp": 1.0259155, "epoch": 0.26376070945438146, "flos": 15047949075840.0, "grad_norm": 3.1188781365018405, "language_loss": 0.77222967, "learning_rate": 3.352147346147736e-06, "loss": 0.79350853, "num_input_tokens_seen": 94802545, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.59765625, "step": 4387, "time_per_iteration": 2.3469078540802 }, { "auxiliary_loss_clip": 0.01084967, "auxiliary_loss_mlp": 0.01040062, "balance_loss_clip": 1.02309895, "balance_loss_mlp": 1.02795732, "epoch": 0.2638208327070494, "flos": 21140781816960.0, "grad_norm": 1.8367250148165088, "language_loss": 0.75953889, "learning_rate": 3.35186892967362e-06, "loss": 0.78078914, "num_input_tokens_seen": 94820730, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5703125, "step": 4388, "time_per_iteration": 2.39621639251709 }, { "auxiliary_loss_clip": 0.01081453, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.01557755, "balance_loss_mlp": 1.0245564, "epoch": 0.26388095595971744, "flos": 21724529616000.0, "grad_norm": 2.0267249034861945, "language_loss": 0.86646807, "learning_rate": 3.3515904649547797e-06, "loss": 0.88760179, "num_input_tokens_seen": 94839175, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4389, "time_per_iteration": 2.369663715362549 }, { "auxiliary_loss_clip": 0.01015645, "auxiliary_loss_mlp": 0.01003079, "balance_loss_clip": 1.00141013, "balance_loss_mlp": 1.00241423, "epoch": 0.2639410792123854, "flos": 65512036368000.0, "grad_norm": 0.8054941886791257, "language_loss": 0.60376012, "learning_rate": 3.351311952001152e-06, "loss": 0.62394738, "num_input_tokens_seen": 94898865, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.1328125, "step": 4390, "time_per_iteration": 3.0563459396362305 }, { "auxiliary_loss_clip": 0.01083623, "auxiliary_loss_mlp": 0.01034569, "balance_loss_clip": 1.0171051, "balance_loss_mlp": 1.02455616, "epoch": 0.2640012024650534, "flos": 23947519570560.0, "grad_norm": 1.6027148621301397, "language_loss": 0.77809501, "learning_rate": 3.3510333908226765e-06, "loss": 0.79927695, "num_input_tokens_seen": 94917490, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.58984375, "step": 4391, "time_per_iteration": 2.4106431007385254 }, { "auxiliary_loss_clip": 0.01014529, "auxiliary_loss_mlp": 0.01001722, "balance_loss_clip": 1.00002944, "balance_loss_mlp": 1.00144744, "epoch": 0.26406132571772134, "flos": 56437620451200.0, "grad_norm": 0.8326141864888711, "language_loss": 0.58650523, "learning_rate": 3.3507547814292953e-06, "loss": 0.60666776, "num_input_tokens_seen": 94969065, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.13085938, "step": 4392, "time_per_iteration": 3.031676769256592 }, { "auxiliary_loss_clip": 0.0108808, "auxiliary_loss_mlp": 0.01032876, "balance_loss_clip": 1.01624668, "balance_loss_mlp": 1.02769303, "epoch": 0.2641214489703893, "flos": 22089476724480.0, "grad_norm": 1.7174289514152088, "language_loss": 0.68520582, "learning_rate": 3.35047612383095e-06, "loss": 0.70641536, "num_input_tokens_seen": 94988540, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.6015625, "step": 4393, "time_per_iteration": 2.3734850883483887 }, { "auxiliary_loss_clip": 0.01086166, "auxiliary_loss_mlp": 0.01036641, "balance_loss_clip": 1.01744819, "balance_loss_mlp": 1.02461278, "epoch": 0.26418157222305727, "flos": 16543837722240.0, "grad_norm": 1.8403628843433328, "language_loss": 0.83997947, "learning_rate": 3.3501974180375857e-06, "loss": 0.86120754, "num_input_tokens_seen": 95004810, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6171875, "step": 4394, "time_per_iteration": 2.378603458404541 }, { "auxiliary_loss_clip": 0.01089048, "auxiliary_loss_mlp": 0.01038934, "balance_loss_clip": 1.01870465, "balance_loss_mlp": 1.0268085, "epoch": 0.26424169547572524, "flos": 18001566385920.0, "grad_norm": 1.9561862411264384, "language_loss": 0.70409507, "learning_rate": 3.349918664059149e-06, "loss": 0.72537494, "num_input_tokens_seen": 95024085, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.625, "step": 4395, "time_per_iteration": 2.3475661277770996 }, { "auxiliary_loss_clip": 0.01083131, "auxiliary_loss_mlp": 0.01030646, "balance_loss_clip": 1.01376581, "balance_loss_mlp": 1.02572632, "epoch": 0.2643018187283932, "flos": 16982207153280.0, "grad_norm": 5.891390896581395, "language_loss": 0.86426324, "learning_rate": 3.3496398619055876e-06, "loss": 0.88540101, "num_input_tokens_seen": 95042515, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.57421875, "step": 4396, "time_per_iteration": 2.4067912101745605 }, { "auxiliary_loss_clip": 0.01016641, "auxiliary_loss_mlp": 0.01002237, "balance_loss_clip": 1.00038898, "balance_loss_mlp": 1.00320745, "epoch": 0.26436194198106117, "flos": 59661396794880.0, "grad_norm": 0.7841854801878994, "language_loss": 0.5502708, "learning_rate": 3.3493610115868505e-06, "loss": 0.5704596, "num_input_tokens_seen": 95094835, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.13476562, "step": 4397, "time_per_iteration": 2.792224884033203 }, { "auxiliary_loss_clip": 0.01084127, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.02174485, "balance_loss_mlp": 1.02656198, "epoch": 0.26442206523372913, "flos": 32920093451520.0, "grad_norm": 2.3186804286987184, "language_loss": 0.78261548, "learning_rate": 3.3490821131128905e-06, "loss": 0.80384642, "num_input_tokens_seen": 95113480, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.578125, "step": 4398, "time_per_iteration": 2.5096499919891357 }, { "auxiliary_loss_clip": 0.01089577, "auxiliary_loss_mlp": 0.01036791, "balance_loss_clip": 1.01837349, "balance_loss_mlp": 1.02938032, "epoch": 0.2644821884863971, "flos": 21030281763840.0, "grad_norm": 1.6897059478807268, "language_loss": 0.67123854, "learning_rate": 3.34880316649366e-06, "loss": 0.6925022, "num_input_tokens_seen": 95132580, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6015625, "step": 4399, "time_per_iteration": 3.792250633239746 }, { "auxiliary_loss_clip": 0.01078808, "auxiliary_loss_mlp": 0.01031263, "balance_loss_clip": 1.01645744, "balance_loss_mlp": 1.02631593, "epoch": 0.26454231173906506, "flos": 20775764885760.0, "grad_norm": 1.8342268768383256, "language_loss": 0.86510193, "learning_rate": 3.3485241717391137e-06, "loss": 0.88620263, "num_input_tokens_seen": 95152375, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5234375, "step": 4400, "time_per_iteration": 2.416428327560425 }, { "auxiliary_loss_clip": 0.01087089, "auxiliary_loss_mlp": 0.01033508, "balance_loss_clip": 1.01489949, "balance_loss_mlp": 1.02763569, "epoch": 0.264602434991733, "flos": 16617713892480.0, "grad_norm": 1.8259711791839561, "language_loss": 0.75744885, "learning_rate": 3.348245128859209e-06, "loss": 0.77865481, "num_input_tokens_seen": 95170265, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.59375, "step": 4401, "time_per_iteration": 2.401073932647705 }, { "auxiliary_loss_clip": 0.01088378, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.02122557, "balance_loss_mlp": 1.02647591, "epoch": 0.26466255824440105, "flos": 19061669041920.0, "grad_norm": 1.7328212095312157, "language_loss": 0.88417149, "learning_rate": 3.3479660378639036e-06, "loss": 0.9054656, "num_input_tokens_seen": 95188655, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.62109375, "step": 4402, "time_per_iteration": 3.7659802436828613 }, { "auxiliary_loss_clip": 0.01084767, "auxiliary_loss_mlp": 0.01031153, "balance_loss_clip": 1.01411796, "balance_loss_mlp": 1.02577353, "epoch": 0.264722681497069, "flos": 22637438513280.0, "grad_norm": 1.7664811494722308, "language_loss": 0.78145206, "learning_rate": 3.3476868987631575e-06, "loss": 0.80261123, "num_input_tokens_seen": 95209615, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.58984375, "step": 4403, "time_per_iteration": 2.400317430496216 }, { "auxiliary_loss_clip": 0.01084942, "auxiliary_loss_mlp": 0.01034145, "balance_loss_clip": 1.01712227, "balance_loss_mlp": 1.02524889, "epoch": 0.264782804749737, "flos": 22491152449920.0, "grad_norm": 1.8419357354733938, "language_loss": 0.88121796, "learning_rate": 3.3474077115669327e-06, "loss": 0.90240884, "num_input_tokens_seen": 95227810, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59765625, "step": 4404, "time_per_iteration": 2.41160249710083 }, { "auxiliary_loss_clip": 0.01084569, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.01737428, "balance_loss_mlp": 1.02517438, "epoch": 0.26484292800240494, "flos": 16799332118400.0, "grad_norm": 1.6810498335282373, "language_loss": 0.76011705, "learning_rate": 3.347128476285193e-06, "loss": 0.78129321, "num_input_tokens_seen": 95245890, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.59375, "step": 4405, "time_per_iteration": 3.7762291431427 }, { "auxiliary_loss_clip": 0.0108679, "auxiliary_loss_mlp": 0.01033609, "balance_loss_clip": 1.01527512, "balance_loss_mlp": 1.02725601, "epoch": 0.2649030512550729, "flos": 20448523912320.0, "grad_norm": 1.7145958692588852, "language_loss": 0.7003299, "learning_rate": 3.346849192927903e-06, "loss": 0.72153401, "num_input_tokens_seen": 95264955, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.59375, "step": 4406, "time_per_iteration": 2.4083986282348633 }, { "auxiliary_loss_clip": 0.01083162, "auxiliary_loss_mlp": 0.01034772, "balance_loss_clip": 1.01766515, "balance_loss_mlp": 1.02575684, "epoch": 0.2649631745077409, "flos": 22415111775360.0, "grad_norm": 1.6771126599034714, "language_loss": 0.83475494, "learning_rate": 3.3465698615050295e-06, "loss": 0.85593432, "num_input_tokens_seen": 95284245, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.57421875, "step": 4407, "time_per_iteration": 2.385935068130493 }, { "auxiliary_loss_clip": 0.01082729, "auxiliary_loss_mlp": 0.0102705, "balance_loss_clip": 1.01056385, "balance_loss_mlp": 1.02529013, "epoch": 0.26502329776040884, "flos": 35114663692800.0, "grad_norm": 1.8707253853818477, "language_loss": 0.75918061, "learning_rate": 3.346290482026542e-06, "loss": 0.78027844, "num_input_tokens_seen": 95307125, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.57421875, "step": 4408, "time_per_iteration": 3.924562931060791 }, { "auxiliary_loss_clip": 0.01081612, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.01716042, "balance_loss_mlp": 1.02506995, "epoch": 0.2650834210130768, "flos": 38686069244160.0, "grad_norm": 1.7151014031569332, "language_loss": 0.71050882, "learning_rate": 3.3460110545024094e-06, "loss": 0.73166645, "num_input_tokens_seen": 95329150, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.56640625, "step": 4409, "time_per_iteration": 2.530848979949951 }, { "auxiliary_loss_clip": 0.01085514, "auxiliary_loss_mlp": 0.01038395, "balance_loss_clip": 1.02051401, "balance_loss_mlp": 1.02572131, "epoch": 0.26514354426574477, "flos": 24715713415680.0, "grad_norm": 1.9588266821955322, "language_loss": 0.73921341, "learning_rate": 3.3457315789426054e-06, "loss": 0.76045251, "num_input_tokens_seen": 95349880, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.59765625, "step": 4410, "time_per_iteration": 2.467231273651123 }, { "auxiliary_loss_clip": 0.0109092, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.01836467, "balance_loss_mlp": 1.0285635, "epoch": 0.26520366751841273, "flos": 20339001377280.0, "grad_norm": 1.8507813084534541, "language_loss": 0.73387146, "learning_rate": 3.345452055357103e-06, "loss": 0.75514489, "num_input_tokens_seen": 95368570, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.625, "step": 4411, "time_per_iteration": 2.424858331680298 }, { "auxiliary_loss_clip": 0.01084199, "auxiliary_loss_mlp": 0.01036434, "balance_loss_clip": 1.0188024, "balance_loss_mlp": 1.02611351, "epoch": 0.2652637907710807, "flos": 22342841527680.0, "grad_norm": 2.0768969364102667, "language_loss": 0.81789207, "learning_rate": 3.345172483755878e-06, "loss": 0.83909839, "num_input_tokens_seen": 95387065, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.578125, "step": 4412, "time_per_iteration": 2.4285106658935547 }, { "auxiliary_loss_clip": 0.01085009, "auxiliary_loss_mlp": 0.01037716, "balance_loss_clip": 1.02158701, "balance_loss_mlp": 1.02665496, "epoch": 0.26532391402374866, "flos": 19353228739200.0, "grad_norm": 2.04123985777002, "language_loss": 0.74686402, "learning_rate": 3.3448928641489057e-06, "loss": 0.76809126, "num_input_tokens_seen": 95406345, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5859375, "step": 4413, "time_per_iteration": 2.3657784461975098 }, { "auxiliary_loss_clip": 0.01016001, "auxiliary_loss_mlp": 0.01002366, "balance_loss_clip": 1.00072134, "balance_loss_mlp": 1.00263, "epoch": 0.26538403727641663, "flos": 44784800138880.0, "grad_norm": 0.8627612956571837, "language_loss": 0.5696466, "learning_rate": 3.344613196546168e-06, "loss": 0.58983028, "num_input_tokens_seen": 95463595, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.1328125, "step": 4414, "time_per_iteration": 2.9951398372650146 }, { "auxiliary_loss_clip": 0.01080325, "auxiliary_loss_mlp": 0.01033703, "balance_loss_clip": 1.01805067, "balance_loss_mlp": 1.02451253, "epoch": 0.26544416052908465, "flos": 28180913011200.0, "grad_norm": 1.7867920966863948, "language_loss": 0.74504185, "learning_rate": 3.3443334809576434e-06, "loss": 0.76618218, "num_input_tokens_seen": 95484115, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.55859375, "step": 4415, "time_per_iteration": 2.4403231143951416 }, { "auxiliary_loss_clip": 0.01086161, "auxiliary_loss_mlp": 0.01034844, "balance_loss_clip": 1.01561606, "balance_loss_mlp": 1.0252192, "epoch": 0.2655042837817526, "flos": 17564349029760.0, "grad_norm": 2.116248324309045, "language_loss": 0.86852682, "learning_rate": 3.344053717393315e-06, "loss": 0.88973689, "num_input_tokens_seen": 95501435, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.609375, "step": 4416, "time_per_iteration": 2.389538526535034 }, { "auxiliary_loss_clip": 0.01087798, "auxiliary_loss_mlp": 0.01033258, "balance_loss_clip": 1.01535249, "balance_loss_mlp": 1.02786207, "epoch": 0.2655644070344206, "flos": 23403502765440.0, "grad_norm": 1.6650351412956075, "language_loss": 0.76366687, "learning_rate": 3.343773905863167e-06, "loss": 0.78487742, "num_input_tokens_seen": 95520135, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.59765625, "step": 4417, "time_per_iteration": 2.390695333480835 }, { "auxiliary_loss_clip": 0.01082511, "auxiliary_loss_mlp": 0.01034343, "balance_loss_clip": 1.01598489, "balance_loss_mlp": 1.02553582, "epoch": 0.26562453028708854, "flos": 26467271015040.0, "grad_norm": 1.6762372542929278, "language_loss": 0.79955584, "learning_rate": 3.3434940463771847e-06, "loss": 0.82072443, "num_input_tokens_seen": 95541705, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.5703125, "step": 4418, "time_per_iteration": 2.4588255882263184 }, { "auxiliary_loss_clip": 0.01086041, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.01663399, "balance_loss_mlp": 1.02636266, "epoch": 0.2656846535397565, "flos": 19206593562240.0, "grad_norm": 3.1997615779180197, "language_loss": 0.67047536, "learning_rate": 3.343214138945356e-06, "loss": 0.69168615, "num_input_tokens_seen": 95560300, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.59765625, "step": 4419, "time_per_iteration": 2.3462040424346924 }, { "auxiliary_loss_clip": 0.0108648, "auxiliary_loss_mlp": 0.01037498, "balance_loss_clip": 1.01873422, "balance_loss_mlp": 1.02627826, "epoch": 0.2657447767924245, "flos": 30550119206400.0, "grad_norm": 1.6345564071009095, "language_loss": 0.79285121, "learning_rate": 3.3429341835776695e-06, "loss": 0.81409103, "num_input_tokens_seen": 95580150, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6015625, "step": 4420, "time_per_iteration": 2.4733612537384033 }, { "auxiliary_loss_clip": 0.01087562, "auxiliary_loss_mlp": 0.01038186, "balance_loss_clip": 1.01818299, "balance_loss_mlp": 1.02625501, "epoch": 0.26580490004509244, "flos": 20921701835520.0, "grad_norm": 1.8012763149988291, "language_loss": 0.81564724, "learning_rate": 3.342654180284117e-06, "loss": 0.83690476, "num_input_tokens_seen": 95597570, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.61328125, "step": 4421, "time_per_iteration": 2.3722877502441406 }, { "auxiliary_loss_clip": 0.0108215, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.01545858, "balance_loss_mlp": 1.02542126, "epoch": 0.2658650232977604, "flos": 43943988798720.0, "grad_norm": 1.6239842243624345, "language_loss": 0.6596427, "learning_rate": 3.3423741290746897e-06, "loss": 0.68078327, "num_input_tokens_seen": 95619415, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.56640625, "step": 4422, "time_per_iteration": 2.5988452434539795 }, { "auxiliary_loss_clip": 0.01084078, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.0167774, "balance_loss_mlp": 1.02452087, "epoch": 0.26592514655042837, "flos": 29715136197120.0, "grad_norm": 2.050396479599726, "language_loss": 0.73857653, "learning_rate": 3.342094029959383e-06, "loss": 0.75976324, "num_input_tokens_seen": 95639155, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.59765625, "step": 4423, "time_per_iteration": 2.4294188022613525 }, { "auxiliary_loss_clip": 0.01081754, "auxiliary_loss_mlp": 0.01039642, "balance_loss_clip": 1.02213025, "balance_loss_mlp": 1.02350807, "epoch": 0.26598526980309634, "flos": 46676082332160.0, "grad_norm": 1.6057609936293193, "language_loss": 0.77617615, "learning_rate": 3.341813882948193e-06, "loss": 0.7973901, "num_input_tokens_seen": 95663320, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.58203125, "step": 4424, "time_per_iteration": 2.6134867668151855 }, { "auxiliary_loss_clip": 0.0108458, "auxiliary_loss_mlp": 0.01038497, "balance_loss_clip": 1.02152133, "balance_loss_mlp": 1.02587628, "epoch": 0.2660453930557643, "flos": 11508663841920.0, "grad_norm": 1.9080086544576587, "language_loss": 0.78946781, "learning_rate": 3.341533688051117e-06, "loss": 0.81069863, "num_input_tokens_seen": 95680260, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5859375, "step": 4425, "time_per_iteration": 2.344456195831299 }, { "auxiliary_loss_clip": 0.01083499, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.02078414, "balance_loss_mlp": 1.02756691, "epoch": 0.26610551630843227, "flos": 24790392547200.0, "grad_norm": 2.648059298186932, "language_loss": 0.8029412, "learning_rate": 3.3412534452781543e-06, "loss": 0.82414162, "num_input_tokens_seen": 95701140, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.55859375, "step": 4426, "time_per_iteration": 2.4670002460479736 }, { "auxiliary_loss_clip": 0.01087237, "auxiliary_loss_mlp": 0.01035856, "balance_loss_clip": 1.01826704, "balance_loss_mlp": 1.02924085, "epoch": 0.26616563956110023, "flos": 27635150638080.0, "grad_norm": 1.7151093890687164, "language_loss": 0.76837152, "learning_rate": 3.3409731546393067e-06, "loss": 0.78960252, "num_input_tokens_seen": 95722060, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.578125, "step": 4427, "time_per_iteration": 2.4607059955596924 }, { "auxiliary_loss_clip": 0.01079872, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.01305056, "balance_loss_mlp": 1.02486455, "epoch": 0.26622576281376825, "flos": 28361728275840.0, "grad_norm": 1.4907260181840092, "language_loss": 0.76644647, "learning_rate": 3.3406928161445756e-06, "loss": 0.78753412, "num_input_tokens_seen": 95742495, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 4428, "time_per_iteration": 2.4728152751922607 }, { "auxiliary_loss_clip": 0.01084616, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.01249576, "balance_loss_mlp": 1.02543139, "epoch": 0.2662858860664362, "flos": 18040354773120.0, "grad_norm": 2.0009360026637766, "language_loss": 0.82547939, "learning_rate": 3.340412429803967e-06, "loss": 0.84661198, "num_input_tokens_seen": 95761510, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.58984375, "step": 4429, "time_per_iteration": 2.355893135070801 }, { "auxiliary_loss_clip": 0.01081085, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.02099943, "balance_loss_mlp": 1.02443218, "epoch": 0.2663460093191042, "flos": 22744761632640.0, "grad_norm": 1.8525218053636021, "language_loss": 0.72379601, "learning_rate": 3.3401319956274872e-06, "loss": 0.74498761, "num_input_tokens_seen": 95782385, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.56640625, "step": 4430, "time_per_iteration": 2.426131010055542 }, { "auxiliary_loss_clip": 0.01086066, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.02156508, "balance_loss_mlp": 1.02631724, "epoch": 0.26640613257177215, "flos": 16507842243840.0, "grad_norm": 4.51152311135043, "language_loss": 0.81984192, "learning_rate": 3.3398515136251435e-06, "loss": 0.84110445, "num_input_tokens_seen": 95800595, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.59765625, "step": 4431, "time_per_iteration": 2.3608851432800293 }, { "auxiliary_loss_clip": 0.01089014, "auxiliary_loss_mlp": 0.01042729, "balance_loss_clip": 1.02395391, "balance_loss_mlp": 1.02706099, "epoch": 0.2664662558244401, "flos": 23074830426240.0, "grad_norm": 2.1298469512924907, "language_loss": 0.7598294, "learning_rate": 3.3395709838069463e-06, "loss": 0.78114682, "num_input_tokens_seen": 95818480, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.6171875, "step": 4432, "time_per_iteration": 2.405139446258545 }, { "auxiliary_loss_clip": 0.01081227, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.01426446, "balance_loss_mlp": 1.02397203, "epoch": 0.2665263790771081, "flos": 23768135671680.0, "grad_norm": 1.8088381619840193, "language_loss": 0.82675636, "learning_rate": 3.3392904061829054e-06, "loss": 0.84788167, "num_input_tokens_seen": 95837205, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.57421875, "step": 4433, "time_per_iteration": 2.3934218883514404 }, { "auxiliary_loss_clip": 0.01082703, "auxiliary_loss_mlp": 0.01040388, "balance_loss_clip": 1.02269769, "balance_loss_mlp": 1.02573907, "epoch": 0.26658650232977604, "flos": 28000027923840.0, "grad_norm": 2.320778181328883, "language_loss": 0.76531565, "learning_rate": 3.3390097807630353e-06, "loss": 0.78654659, "num_input_tokens_seen": 95858395, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5703125, "step": 4434, "time_per_iteration": 2.459339141845703 }, { "auxiliary_loss_clip": 0.01083463, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.01747119, "balance_loss_mlp": 1.02591419, "epoch": 0.266646625582444, "flos": 22162549933440.0, "grad_norm": 2.067932987258131, "language_loss": 0.82558548, "learning_rate": 3.3387291075573508e-06, "loss": 0.84675425, "num_input_tokens_seen": 95877875, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.57421875, "step": 4435, "time_per_iteration": 2.3734118938446045 }, { "auxiliary_loss_clip": 0.01086973, "auxiliary_loss_mlp": 0.01043836, "balance_loss_clip": 1.02633619, "balance_loss_mlp": 1.02657735, "epoch": 0.266706748835112, "flos": 27852345406080.0, "grad_norm": 8.327866476886152, "language_loss": 0.88007295, "learning_rate": 3.3384483865758677e-06, "loss": 0.90138102, "num_input_tokens_seen": 95895820, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.60546875, "step": 4436, "time_per_iteration": 2.447432041168213 }, { "auxiliary_loss_clip": 0.01083402, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.01635647, "balance_loss_mlp": 1.02495182, "epoch": 0.26676687208777994, "flos": 25810938766080.0, "grad_norm": 1.6499649120763606, "language_loss": 0.78677368, "learning_rate": 3.3381676178286047e-06, "loss": 0.80793798, "num_input_tokens_seen": 95918025, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5859375, "step": 4437, "time_per_iteration": 2.450538396835327 }, { "auxiliary_loss_clip": 0.01082209, "auxiliary_loss_mlp": 0.01033849, "balance_loss_clip": 1.01734996, "balance_loss_mlp": 1.02518868, "epoch": 0.2668269953404479, "flos": 36063114220800.0, "grad_norm": 1.9899374655950153, "language_loss": 0.63907933, "learning_rate": 3.337886801325582e-06, "loss": 0.66023993, "num_input_tokens_seen": 95937725, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5703125, "step": 4438, "time_per_iteration": 3.877885341644287 }, { "auxiliary_loss_clip": 0.01082817, "auxiliary_loss_mlp": 0.0103528, "balance_loss_clip": 1.0182811, "balance_loss_mlp": 1.02498817, "epoch": 0.26688711859311587, "flos": 26569985834880.0, "grad_norm": 1.9465374522148835, "language_loss": 0.75652754, "learning_rate": 3.3376059370768202e-06, "loss": 0.77770853, "num_input_tokens_seen": 95956335, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.578125, "step": 4439, "time_per_iteration": 2.3986637592315674 }, { "auxiliary_loss_clip": 0.01084893, "auxiliary_loss_mlp": 0.01032577, "balance_loss_clip": 1.01428986, "balance_loss_mlp": 1.02523625, "epoch": 0.26694724184578383, "flos": 26760331900800.0, "grad_norm": 1.7451553085331943, "language_loss": 0.71426797, "learning_rate": 3.337325025092344e-06, "loss": 0.73544276, "num_input_tokens_seen": 95977135, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.59765625, "step": 4440, "time_per_iteration": 2.4282066822052 }, { "auxiliary_loss_clip": 0.01085171, "auxiliary_loss_mlp": 0.01041042, "balance_loss_clip": 1.02288067, "balance_loss_mlp": 1.02618694, "epoch": 0.2670073650984518, "flos": 20958535186560.0, "grad_norm": 1.8660410835930377, "language_loss": 0.66998219, "learning_rate": 3.337044065382177e-06, "loss": 0.69124424, "num_input_tokens_seen": 95995435, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.58984375, "step": 4441, "time_per_iteration": 3.757274866104126 }, { "auxiliary_loss_clip": 0.01082918, "auxiliary_loss_mlp": 0.01035758, "balance_loss_clip": 1.0182941, "balance_loss_mlp": 1.02545154, "epoch": 0.2670674883511198, "flos": 28364800475520.0, "grad_norm": 1.4717365983727981, "language_loss": 0.76371771, "learning_rate": 3.3367630579563465e-06, "loss": 0.78490448, "num_input_tokens_seen": 96016340, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.57421875, "step": 4442, "time_per_iteration": 2.42803692817688 }, { "auxiliary_loss_clip": 0.01016479, "auxiliary_loss_mlp": 0.01003547, "balance_loss_clip": 1.00148439, "balance_loss_mlp": 1.00325418, "epoch": 0.2671276116037878, "flos": 58968370840320.0, "grad_norm": 0.9241934998108896, "language_loss": 0.61196792, "learning_rate": 3.3364820028248816e-06, "loss": 0.63216817, "num_input_tokens_seen": 96071205, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.13183594, "step": 4443, "time_per_iteration": 2.927440643310547 }, { "auxiliary_loss_clip": 0.01083943, "auxiliary_loss_mlp": 0.01035382, "balance_loss_clip": 1.01788235, "balance_loss_mlp": 1.02477372, "epoch": 0.26718773485645575, "flos": 43943395305600.0, "grad_norm": 1.5111749557406475, "language_loss": 0.76085961, "learning_rate": 3.336200899997812e-06, "loss": 0.78205287, "num_input_tokens_seen": 96094240, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 4444, "time_per_iteration": 2.577420949935913 }, { "auxiliary_loss_clip": 0.01084404, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.01355004, "balance_loss_mlp": 1.02521873, "epoch": 0.2672478581091237, "flos": 25227156055680.0, "grad_norm": 1.6970284162687346, "language_loss": 0.80564058, "learning_rate": 3.3359197494851687e-06, "loss": 0.82680142, "num_input_tokens_seen": 96114105, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 4445, "time_per_iteration": 3.7857730388641357 }, { "auxiliary_loss_clip": 0.01083295, "auxiliary_loss_mlp": 0.01027723, "balance_loss_clip": 1.00950766, "balance_loss_mlp": 1.02388334, "epoch": 0.2673079813617917, "flos": 15267273436800.0, "grad_norm": 1.8964928673104666, "language_loss": 0.89086282, "learning_rate": 3.335638551296986e-06, "loss": 0.911973, "num_input_tokens_seen": 96132140, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 4446, "time_per_iteration": 2.3497743606567383 }, { "auxiliary_loss_clip": 0.01083816, "auxiliary_loss_mlp": 0.01034704, "balance_loss_clip": 1.01788342, "balance_loss_mlp": 1.02553654, "epoch": 0.26736810461445965, "flos": 25811532259200.0, "grad_norm": 1.7236954988024644, "language_loss": 0.67969894, "learning_rate": 3.3353573054432997e-06, "loss": 0.70088416, "num_input_tokens_seen": 96152090, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.58203125, "step": 4447, "time_per_iteration": 2.416245222091675 }, { "auxiliary_loss_clip": 0.01083702, "auxiliary_loss_mlp": 0.01033604, "balance_loss_clip": 1.01583028, "balance_loss_mlp": 1.02472448, "epoch": 0.2674282278671276, "flos": 24311663717760.0, "grad_norm": 1.8990166927242131, "language_loss": 0.83607161, "learning_rate": 3.335076011934146e-06, "loss": 0.85724467, "num_input_tokens_seen": 96170015, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58984375, "step": 4448, "time_per_iteration": 3.8704729080200195 }, { "auxiliary_loss_clip": 0.0108171, "auxiliary_loss_mlp": 0.01040913, "balance_loss_clip": 1.0238775, "balance_loss_mlp": 1.02470696, "epoch": 0.2674883511197956, "flos": 22814553173760.0, "grad_norm": 1.6268594302914823, "language_loss": 0.84376764, "learning_rate": 3.3347946707795627e-06, "loss": 0.86499381, "num_input_tokens_seen": 96188065, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5703125, "step": 4449, "time_per_iteration": 2.4050981998443604 }, { "auxiliary_loss_clip": 0.01089157, "auxiliary_loss_mlp": 0.0104556, "balance_loss_clip": 1.02492476, "balance_loss_mlp": 1.02528596, "epoch": 0.26754847437246354, "flos": 25369113110400.0, "grad_norm": 1.6909772018346736, "language_loss": 0.83999503, "learning_rate": 3.3345132819895918e-06, "loss": 0.86134219, "num_input_tokens_seen": 96205780, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 0.63671875, "step": 4450, "time_per_iteration": 2.420567750930786 }, { "auxiliary_loss_clip": 0.01080093, "auxiliary_loss_mlp": 0.0103072, "balance_loss_clip": 1.01474607, "balance_loss_mlp": 1.02417755, "epoch": 0.2676085976251315, "flos": 20229374108160.0, "grad_norm": 1.837832292531925, "language_loss": 0.81029725, "learning_rate": 3.3342318455742748e-06, "loss": 0.8314054, "num_input_tokens_seen": 96224990, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.55859375, "step": 4451, "time_per_iteration": 2.363407850265503 }, { "auxiliary_loss_clip": 0.01081973, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.01626468, "balance_loss_mlp": 1.0250107, "epoch": 0.26766872087779947, "flos": 28036966008960.0, "grad_norm": 1.5996466916580925, "language_loss": 0.86545944, "learning_rate": 3.333950361543655e-06, "loss": 0.88660598, "num_input_tokens_seen": 96245345, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4452, "time_per_iteration": 2.436713695526123 }, { "auxiliary_loss_clip": 0.01086419, "auxiliary_loss_mlp": 0.01040497, "balance_loss_clip": 1.02232981, "balance_loss_mlp": 1.02634895, "epoch": 0.26772884413046744, "flos": 18324408528000.0, "grad_norm": 2.205205885981051, "language_loss": 0.83156228, "learning_rate": 3.333668829907778e-06, "loss": 0.85283148, "num_input_tokens_seen": 96259000, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6015625, "step": 4453, "time_per_iteration": 2.3421266078948975 }, { "auxiliary_loss_clip": 0.01083446, "auxiliary_loss_mlp": 0.01040058, "balance_loss_clip": 1.02271318, "balance_loss_mlp": 1.02626002, "epoch": 0.2677889673831354, "flos": 22126414809600.0, "grad_norm": 1.6591621254335382, "language_loss": 0.79507649, "learning_rate": 3.333387250676692e-06, "loss": 0.8163116, "num_input_tokens_seen": 96277000, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5703125, "step": 4454, "time_per_iteration": 2.399909734725952 }, { "auxiliary_loss_clip": 0.01085202, "auxiliary_loss_mlp": 0.01034051, "balance_loss_clip": 1.01743317, "balance_loss_mlp": 1.02547109, "epoch": 0.2678490906358034, "flos": 23728649057280.0, "grad_norm": 1.984671084137888, "language_loss": 0.72933257, "learning_rate": 3.3331056238604437e-06, "loss": 0.75052512, "num_input_tokens_seen": 96297010, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59765625, "step": 4455, "time_per_iteration": 2.3977081775665283 }, { "auxiliary_loss_clip": 0.0107994, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.01782513, "balance_loss_mlp": 1.02473307, "epoch": 0.2679092138884714, "flos": 21761781903360.0, "grad_norm": 1.4850696264004095, "language_loss": 0.73548663, "learning_rate": 3.3328239494690856e-06, "loss": 0.75662315, "num_input_tokens_seen": 96315780, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5546875, "step": 4456, "time_per_iteration": 2.3810369968414307 }, { "auxiliary_loss_clip": 0.01084747, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.01532412, "balance_loss_mlp": 1.0246042, "epoch": 0.26796933714113935, "flos": 19860272547840.0, "grad_norm": 2.1995952153618297, "language_loss": 0.70346195, "learning_rate": 3.332542227512669e-06, "loss": 0.72463691, "num_input_tokens_seen": 96333465, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59765625, "step": 4457, "time_per_iteration": 2.371623992919922 }, { "auxiliary_loss_clip": 0.01084452, "auxiliary_loss_mlp": 0.01031935, "balance_loss_clip": 1.01416063, "balance_loss_mlp": 1.0263896, "epoch": 0.2680294603938073, "flos": 20046848186880.0, "grad_norm": 1.6161514704959818, "language_loss": 0.78763568, "learning_rate": 3.332260458001248e-06, "loss": 0.80879956, "num_input_tokens_seen": 96352005, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58203125, "step": 4458, "time_per_iteration": 2.3723626136779785 }, { "auxiliary_loss_clip": 0.01015893, "auxiliary_loss_mlp": 0.01003243, "balance_loss_clip": 1.00110888, "balance_loss_mlp": 1.00234771, "epoch": 0.2680895836464753, "flos": 72110237172480.0, "grad_norm": 0.8507323796003085, "language_loss": 0.58605993, "learning_rate": 3.3319786409448776e-06, "loss": 0.6062513, "num_input_tokens_seen": 96406265, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.13574219, "step": 4459, "time_per_iteration": 3.0016634464263916 }, { "auxiliary_loss_clip": 0.01080804, "auxiliary_loss_mlp": 0.01031638, "balance_loss_clip": 1.0156517, "balance_loss_mlp": 1.02370119, "epoch": 0.26814970689914325, "flos": 20448000241920.0, "grad_norm": 4.775208096770727, "language_loss": 0.85241407, "learning_rate": 3.3316967763536167e-06, "loss": 0.87353843, "num_input_tokens_seen": 96425225, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5703125, "step": 4460, "time_per_iteration": 2.3793869018554688 }, { "auxiliary_loss_clip": 0.0108355, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.01696181, "balance_loss_mlp": 1.02480114, "epoch": 0.2682098301518112, "flos": 17565710572800.0, "grad_norm": 2.0053813378551646, "language_loss": 0.68624145, "learning_rate": 3.331414864237523e-06, "loss": 0.70741045, "num_input_tokens_seen": 96443780, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5859375, "step": 4461, "time_per_iteration": 2.3408210277557373 }, { "auxiliary_loss_clip": 0.01081883, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.01706028, "balance_loss_mlp": 1.0243696, "epoch": 0.2682699534044792, "flos": 18332263584000.0, "grad_norm": 1.5110738980636826, "language_loss": 0.67033225, "learning_rate": 3.331132904606658e-06, "loss": 0.69149953, "num_input_tokens_seen": 96464530, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.57421875, "step": 4462, "time_per_iteration": 2.398047685623169 }, { "auxiliary_loss_clip": 0.0108338, "auxiliary_loss_mlp": 0.01036181, "balance_loss_clip": 1.01815701, "balance_loss_mlp": 1.02598441, "epoch": 0.26833007665714714, "flos": 25300124530560.0, "grad_norm": 1.5166486785855047, "language_loss": 0.69246829, "learning_rate": 3.330850897471083e-06, "loss": 0.71366394, "num_input_tokens_seen": 96483345, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.57421875, "step": 4463, "time_per_iteration": 2.4224934577941895 }, { "auxiliary_loss_clip": 0.01084357, "auxiliary_loss_mlp": 0.01033975, "balance_loss_clip": 1.01672494, "balance_loss_mlp": 1.02491117, "epoch": 0.2683901999098151, "flos": 16099044600960.0, "grad_norm": 5.2496402568894025, "language_loss": 0.77723622, "learning_rate": 3.3305688428408634e-06, "loss": 0.79841954, "num_input_tokens_seen": 96498305, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.59375, "step": 4464, "time_per_iteration": 2.3258721828460693 }, { "auxiliary_loss_clip": 0.01081374, "auxiliary_loss_mlp": 0.01030645, "balance_loss_clip": 1.01394343, "balance_loss_mlp": 1.02392387, "epoch": 0.2684503231624831, "flos": 27306827412480.0, "grad_norm": 1.7469384464010271, "language_loss": 0.70595694, "learning_rate": 3.330286740726064e-06, "loss": 0.72707713, "num_input_tokens_seen": 96519740, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.57421875, "step": 4465, "time_per_iteration": 2.4362828731536865 }, { "auxiliary_loss_clip": 0.01014993, "auxiliary_loss_mlp": 0.01005577, "balance_loss_clip": 1.0037415, "balance_loss_mlp": 1.00172997, "epoch": 0.26851044641515104, "flos": 71854498396800.0, "grad_norm": 0.6721368002712322, "language_loss": 0.53067428, "learning_rate": 3.3300045911367527e-06, "loss": 0.55088001, "num_input_tokens_seen": 96588870, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.1328125, "step": 4466, "time_per_iteration": 3.1842169761657715 }, { "auxiliary_loss_clip": 0.01080139, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.01872981, "balance_loss_mlp": 1.02496719, "epoch": 0.268570569667819, "flos": 18732787234560.0, "grad_norm": 1.793836553207938, "language_loss": 0.74083984, "learning_rate": 3.3297223940829993e-06, "loss": 0.76200271, "num_input_tokens_seen": 96605100, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.55078125, "step": 4467, "time_per_iteration": 2.346865177154541 }, { "auxiliary_loss_clip": 0.01083256, "auxiliary_loss_mlp": 0.01036764, "balance_loss_clip": 1.019526, "balance_loss_mlp": 1.02402246, "epoch": 0.268630692920487, "flos": 18177633705600.0, "grad_norm": 2.191278880275011, "language_loss": 0.80200934, "learning_rate": 3.3294401495748733e-06, "loss": 0.82320952, "num_input_tokens_seen": 96621410, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.59375, "step": 4468, "time_per_iteration": 2.3384740352630615 }, { "auxiliary_loss_clip": 0.01014854, "auxiliary_loss_mlp": 0.01003605, "balance_loss_clip": 1.00197148, "balance_loss_mlp": 1.00198507, "epoch": 0.268690816173155, "flos": 68728025612160.0, "grad_norm": 0.8446363794832309, "language_loss": 0.594836, "learning_rate": 3.3291578576224487e-06, "loss": 0.61502063, "num_input_tokens_seen": 96684810, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.12890625, "step": 4469, "time_per_iteration": 3.1167914867401123 }, { "auxiliary_loss_clip": 0.01084916, "auxiliary_loss_mlp": 0.01037397, "balance_loss_clip": 1.01933658, "balance_loss_mlp": 1.02664685, "epoch": 0.26875093942582295, "flos": 23292548864640.0, "grad_norm": 2.008031001637529, "language_loss": 0.81701338, "learning_rate": 3.328875518235799e-06, "loss": 0.83823645, "num_input_tokens_seen": 96701920, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.58203125, "step": 4470, "time_per_iteration": 2.3879146575927734 }, { "auxiliary_loss_clip": 0.01077523, "auxiliary_loss_mlp": 0.01031587, "balance_loss_clip": 1.01607752, "balance_loss_mlp": 1.02323973, "epoch": 0.2688110626784909, "flos": 21542387719680.0, "grad_norm": 1.6038821516622725, "language_loss": 0.82837486, "learning_rate": 3.328593131425e-06, "loss": 0.84946591, "num_input_tokens_seen": 96721260, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.54296875, "step": 4471, "time_per_iteration": 2.3821933269500732 }, { "auxiliary_loss_clip": 0.0108063, "auxiliary_loss_mlp": 0.01033538, "balance_loss_clip": 1.017838, "balance_loss_mlp": 1.02507091, "epoch": 0.2688711859311589, "flos": 28399399499520.0, "grad_norm": 1.9848390976572985, "language_loss": 0.69358706, "learning_rate": 3.3283106972001303e-06, "loss": 0.71472877, "num_input_tokens_seen": 96740385, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5546875, "step": 4472, "time_per_iteration": 2.44398832321167 }, { "auxiliary_loss_clip": 0.0108238, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.02001083, "balance_loss_mlp": 1.02520609, "epoch": 0.26893130918382685, "flos": 25993743978240.0, "grad_norm": 1.6237287978466413, "language_loss": 0.67831284, "learning_rate": 3.3280282155712684e-06, "loss": 0.69949752, "num_input_tokens_seen": 96761860, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5703125, "step": 4473, "time_per_iteration": 2.4261202812194824 }, { "auxiliary_loss_clip": 0.01079509, "auxiliary_loss_mlp": 0.01039558, "balance_loss_clip": 1.02351272, "balance_loss_mlp": 1.02467394, "epoch": 0.2689914324364948, "flos": 20338582440960.0, "grad_norm": 1.6609874782940333, "language_loss": 0.82910681, "learning_rate": 3.3277456865484956e-06, "loss": 0.85029745, "num_input_tokens_seen": 96781890, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 4474, "time_per_iteration": 2.38499116897583 }, { "auxiliary_loss_clip": 0.01079055, "auxiliary_loss_mlp": 0.01040125, "balance_loss_clip": 1.02459157, "balance_loss_mlp": 1.02461219, "epoch": 0.2690515556891628, "flos": 19463519324160.0, "grad_norm": 1.931247857755048, "language_loss": 0.70672166, "learning_rate": 3.3274631101418942e-06, "loss": 0.7279135, "num_input_tokens_seen": 96800390, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.54296875, "step": 4475, "time_per_iteration": 2.363485336303711 }, { "auxiliary_loss_clip": 0.01080789, "auxiliary_loss_mlp": 0.01038167, "balance_loss_clip": 1.02138877, "balance_loss_mlp": 1.02387619, "epoch": 0.26911167894183075, "flos": 18145757767680.0, "grad_norm": 1.6858005728302072, "language_loss": 0.72981411, "learning_rate": 3.32718048636155e-06, "loss": 0.75100362, "num_input_tokens_seen": 96816685, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 4476, "time_per_iteration": 2.3534677028656006 }, { "auxiliary_loss_clip": 0.01078047, "auxiliary_loss_mlp": 0.01030438, "balance_loss_clip": 1.01535809, "balance_loss_mlp": 1.02385592, "epoch": 0.2691718021944987, "flos": 19974089180160.0, "grad_norm": 1.7933882984678577, "language_loss": 0.81010187, "learning_rate": 3.3268978152175474e-06, "loss": 0.83118671, "num_input_tokens_seen": 96836285, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.54296875, "step": 4477, "time_per_iteration": 2.430464744567871 }, { "auxiliary_loss_clip": 0.01080129, "auxiliary_loss_mlp": 0.01035734, "balance_loss_clip": 1.0198555, "balance_loss_mlp": 1.02281356, "epoch": 0.2692319254471667, "flos": 37445814639360.0, "grad_norm": 1.5332979734477428, "language_loss": 0.64887929, "learning_rate": 3.3266150967199752e-06, "loss": 0.67003787, "num_input_tokens_seen": 96857745, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5703125, "step": 4478, "time_per_iteration": 3.8750460147857666 }, { "auxiliary_loss_clip": 0.01079824, "auxiliary_loss_mlp": 0.01032956, "balance_loss_clip": 1.01612389, "balance_loss_mlp": 1.02383912, "epoch": 0.26929204869983464, "flos": 22126694100480.0, "grad_norm": 1.9379773878178308, "language_loss": 0.80459404, "learning_rate": 3.3263323308789225e-06, "loss": 0.82572174, "num_input_tokens_seen": 96877295, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.55859375, "step": 4479, "time_per_iteration": 2.3893632888793945 }, { "auxiliary_loss_clip": 0.01081605, "auxiliary_loss_mlp": 0.01037132, "balance_loss_clip": 1.02120578, "balance_loss_mlp": 1.02322698, "epoch": 0.2693521719525026, "flos": 19791772727040.0, "grad_norm": 2.4539283943618018, "language_loss": 0.80995786, "learning_rate": 3.3260495177044806e-06, "loss": 0.83114529, "num_input_tokens_seen": 96896160, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5859375, "step": 4480, "time_per_iteration": 2.3738739490509033 }, { "auxiliary_loss_clip": 0.01075662, "auxiliary_loss_mlp": 0.01028928, "balance_loss_clip": 1.01440787, "balance_loss_mlp": 1.02309871, "epoch": 0.2694122952051706, "flos": 20993378590080.0, "grad_norm": 1.5241829458860563, "language_loss": 0.78021401, "learning_rate": 3.325766657206743e-06, "loss": 0.80125993, "num_input_tokens_seen": 96915410, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.5234375, "step": 4481, "time_per_iteration": 3.7754180431365967 }, { "auxiliary_loss_clip": 0.01081858, "auxiliary_loss_mlp": 0.01036941, "balance_loss_clip": 1.02078819, "balance_loss_mlp": 1.02476931, "epoch": 0.2694724184578386, "flos": 25848086319360.0, "grad_norm": 1.9495649123639187, "language_loss": 0.7387563, "learning_rate": 3.3254837493958032e-06, "loss": 0.75994426, "num_input_tokens_seen": 96937865, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5703125, "step": 4482, "time_per_iteration": 2.448399543762207 }, { "auxiliary_loss_clip": 0.01083359, "auxiliary_loss_mlp": 0.01030811, "balance_loss_clip": 1.01433611, "balance_loss_mlp": 1.02650142, "epoch": 0.26953254171050656, "flos": 21725856247680.0, "grad_norm": 1.8219840131068075, "language_loss": 0.72292948, "learning_rate": 3.3252007942817575e-06, "loss": 0.74407113, "num_input_tokens_seen": 96957710, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4483, "time_per_iteration": 2.3748221397399902 }, { "auxiliary_loss_clip": 0.01083569, "auxiliary_loss_mlp": 0.01036102, "balance_loss_clip": 1.0182085, "balance_loss_mlp": 1.02419567, "epoch": 0.2695926649631745, "flos": 19681901078400.0, "grad_norm": 3.6791924321997707, "language_loss": 0.86999154, "learning_rate": 3.324917791874705e-06, "loss": 0.8911882, "num_input_tokens_seen": 96975890, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.59375, "step": 4484, "time_per_iteration": 3.836228132247925 }, { "auxiliary_loss_clip": 0.01082217, "auxiliary_loss_mlp": 0.01030014, "balance_loss_clip": 1.01475477, "balance_loss_mlp": 1.02469516, "epoch": 0.2696527882158425, "flos": 32885319870720.0, "grad_norm": 1.4598561573104194, "language_loss": 0.66220701, "learning_rate": 3.324634742184744e-06, "loss": 0.68332934, "num_input_tokens_seen": 96998595, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.578125, "step": 4485, "time_per_iteration": 2.4773032665252686 }, { "auxiliary_loss_clip": 0.0108219, "auxiliary_loss_mlp": 0.01034617, "balance_loss_clip": 1.01782048, "balance_loss_mlp": 1.02494526, "epoch": 0.26971291146851045, "flos": 12124182844800.0, "grad_norm": 2.2793826971817217, "language_loss": 0.72640491, "learning_rate": 3.324351645221977e-06, "loss": 0.74757296, "num_input_tokens_seen": 97013715, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 4486, "time_per_iteration": 2.3354594707489014 }, { "auxiliary_loss_clip": 0.01087052, "auxiliary_loss_mlp": 0.01038748, "balance_loss_clip": 1.02220225, "balance_loss_mlp": 1.02663565, "epoch": 0.2697730347211784, "flos": 22633458618240.0, "grad_norm": 1.7549295907656801, "language_loss": 0.84021676, "learning_rate": 3.3240685009965065e-06, "loss": 0.86147475, "num_input_tokens_seen": 97031570, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.60546875, "step": 4487, "time_per_iteration": 2.3800137042999268 }, { "auxiliary_loss_clip": 0.01082813, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 1.0196538, "balance_loss_mlp": 1.02519119, "epoch": 0.2698331579738464, "flos": 23511943048320.0, "grad_norm": 2.158231889353338, "language_loss": 0.71825504, "learning_rate": 3.3237853095184365e-06, "loss": 0.73944944, "num_input_tokens_seen": 97049815, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.57421875, "step": 4488, "time_per_iteration": 3.789289712905884 }, { "auxiliary_loss_clip": 0.01082345, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.0201546, "balance_loss_mlp": 1.02542758, "epoch": 0.26989328122651435, "flos": 24639986943360.0, "grad_norm": 1.7189116236507618, "language_loss": 0.83743238, "learning_rate": 3.3235020707978747e-06, "loss": 0.85862446, "num_input_tokens_seen": 97067570, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5703125, "step": 4489, "time_per_iteration": 2.407876491546631 }, { "auxiliary_loss_clip": 0.01084075, "auxiliary_loss_mlp": 0.01038257, "balance_loss_clip": 1.02117491, "balance_loss_mlp": 1.02525234, "epoch": 0.2699534044791823, "flos": 10771996821120.0, "grad_norm": 2.4911137031080997, "language_loss": 0.89687502, "learning_rate": 3.323218784844928e-06, "loss": 0.91809833, "num_input_tokens_seen": 97082180, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5859375, "step": 4490, "time_per_iteration": 2.357886791229248 }, { "auxiliary_loss_clip": 0.01079328, "auxiliary_loss_mlp": 0.01032723, "balance_loss_clip": 1.01703525, "balance_loss_mlp": 1.02451301, "epoch": 0.2700135277318503, "flos": 36170192960640.0, "grad_norm": 1.9868071240327854, "language_loss": 0.73218596, "learning_rate": 3.322935451669706e-06, "loss": 0.75330651, "num_input_tokens_seen": 97103470, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.546875, "step": 4491, "time_per_iteration": 2.493103504180908 }, { "auxiliary_loss_clip": 0.01085746, "auxiliary_loss_mlp": 0.0104042, "balance_loss_clip": 1.0233376, "balance_loss_mlp": 1.02712989, "epoch": 0.27007365098451824, "flos": 17417713852800.0, "grad_norm": 2.7418608079080005, "language_loss": 0.74333012, "learning_rate": 3.322652071282322e-06, "loss": 0.76459181, "num_input_tokens_seen": 97118100, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5859375, "step": 4492, "time_per_iteration": 2.3537187576293945 }, { "auxiliary_loss_clip": 0.01082365, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.01431274, "balance_loss_mlp": 1.0255754, "epoch": 0.2701337742371862, "flos": 23184562429440.0, "grad_norm": 1.9799655065005513, "language_loss": 0.88832211, "learning_rate": 3.3223686436928874e-06, "loss": 0.90945053, "num_input_tokens_seen": 97136765, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5703125, "step": 4493, "time_per_iteration": 2.388761281967163 }, { "auxiliary_loss_clip": 0.0108122, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.01787066, "balance_loss_mlp": 1.02600765, "epoch": 0.2701938974898542, "flos": 24388297885440.0, "grad_norm": 1.453558530019349, "language_loss": 0.71056789, "learning_rate": 3.322085168911517e-06, "loss": 0.73170859, "num_input_tokens_seen": 97157470, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.55078125, "step": 4494, "time_per_iteration": 2.4287607669830322 }, { "auxiliary_loss_clip": 0.01077588, "auxiliary_loss_mlp": 0.01035566, "balance_loss_clip": 1.01991987, "balance_loss_mlp": 1.02271843, "epoch": 0.2702540207425222, "flos": 26213103250560.0, "grad_norm": 1.9191194347989033, "language_loss": 0.8621546, "learning_rate": 3.321801646948328e-06, "loss": 0.88328612, "num_input_tokens_seen": 97176905, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.546875, "step": 4495, "time_per_iteration": 2.4040403366088867 }, { "auxiliary_loss_clip": 0.01081717, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.01673281, "balance_loss_mlp": 1.02571392, "epoch": 0.27031414399519016, "flos": 22925367429120.0, "grad_norm": 1.605148411167534, "language_loss": 0.76578534, "learning_rate": 3.321518077813438e-06, "loss": 0.78692782, "num_input_tokens_seen": 97196380, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5625, "step": 4496, "time_per_iteration": 2.4081809520721436 }, { "auxiliary_loss_clip": 0.01015898, "auxiliary_loss_mlp": 0.01008557, "balance_loss_clip": 1.00716174, "balance_loss_mlp": 1.0027616, "epoch": 0.2703742672478581, "flos": 63016759653120.0, "grad_norm": 0.7033280233919958, "language_loss": 0.50172031, "learning_rate": 3.321234461516967e-06, "loss": 0.52196479, "num_input_tokens_seen": 97260100, "router_z_loss_clip": 0.01397705, "router_z_loss_mlp": 0.13085938, "step": 4497, "time_per_iteration": 3.089003086090088 }, { "auxiliary_loss_clip": 0.01083106, "auxiliary_loss_mlp": 0.01036326, "balance_loss_clip": 1.02027988, "balance_loss_mlp": 1.02663255, "epoch": 0.2704343905005261, "flos": 18839900885760.0, "grad_norm": 1.5242792975239683, "language_loss": 0.72006714, "learning_rate": 3.3209507980690375e-06, "loss": 0.74126148, "num_input_tokens_seen": 97277935, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.56640625, "step": 4498, "time_per_iteration": 2.4045588970184326 }, { "auxiliary_loss_clip": 0.01015125, "auxiliary_loss_mlp": 0.0100553, "balance_loss_clip": 1.00398076, "balance_loss_mlp": 1.00229216, "epoch": 0.27049451375319405, "flos": 71230042085760.0, "grad_norm": 0.7530257662081888, "language_loss": 0.59190583, "learning_rate": 3.3206670874797717e-06, "loss": 0.6121124, "num_input_tokens_seen": 97338845, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.12890625, "step": 4499, "time_per_iteration": 3.0591161251068115 }, { "auxiliary_loss_clip": 0.01079214, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.01448965, "balance_loss_mlp": 1.02431273, "epoch": 0.270554637005862, "flos": 24277483630080.0, "grad_norm": 1.8792993719860758, "language_loss": 0.7363081, "learning_rate": 3.3203833297592943e-06, "loss": 0.7574091, "num_input_tokens_seen": 97356640, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 4500, "time_per_iteration": 2.4270050525665283 }, { "auxiliary_loss_clip": 0.01079428, "auxiliary_loss_mlp": 0.01036149, "balance_loss_clip": 1.02034807, "balance_loss_mlp": 1.02346158, "epoch": 0.27061476025853, "flos": 17631557130240.0, "grad_norm": 2.8435527376846386, "language_loss": 0.80917323, "learning_rate": 3.3200995249177324e-06, "loss": 0.830329, "num_input_tokens_seen": 97372585, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55859375, "step": 4501, "time_per_iteration": 2.3408093452453613 }, { "auxiliary_loss_clip": 0.0108064, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.01761258, "balance_loss_mlp": 1.02366543, "epoch": 0.27067488351119795, "flos": 22709045445120.0, "grad_norm": 2.0256480088020057, "language_loss": 0.72636497, "learning_rate": 3.3198156729652144e-06, "loss": 0.74751461, "num_input_tokens_seen": 97393315, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5703125, "step": 4502, "time_per_iteration": 2.4176254272460938 }, { "auxiliary_loss_clip": 0.01081711, "auxiliary_loss_mlp": 0.0103455, "balance_loss_clip": 1.01634669, "balance_loss_mlp": 1.02251172, "epoch": 0.2707350067638659, "flos": 41717996467200.0, "grad_norm": 1.7990912716135352, "language_loss": 0.68265796, "learning_rate": 3.31953177391187e-06, "loss": 0.70382053, "num_input_tokens_seen": 97417860, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 4503, "time_per_iteration": 2.562025785446167 }, { "auxiliary_loss_clip": 0.01080471, "auxiliary_loss_mlp": 0.01031379, "balance_loss_clip": 1.01566768, "balance_loss_mlp": 1.02419126, "epoch": 0.2707951300165339, "flos": 20192017086720.0, "grad_norm": 1.8419824353840168, "language_loss": 0.67978293, "learning_rate": 3.319247827767831e-06, "loss": 0.70090151, "num_input_tokens_seen": 97436780, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5625, "step": 4504, "time_per_iteration": 2.3766422271728516 }, { "auxiliary_loss_clip": 0.01081967, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.01815891, "balance_loss_mlp": 1.02605009, "epoch": 0.27085525326920185, "flos": 21432900096000.0, "grad_norm": 1.4023218067460208, "language_loss": 0.75659472, "learning_rate": 3.31896383454323e-06, "loss": 0.77775782, "num_input_tokens_seen": 97456190, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55859375, "step": 4505, "time_per_iteration": 2.384819746017456 }, { "auxiliary_loss_clip": 0.01086919, "auxiliary_loss_mlp": 0.01041346, "balance_loss_clip": 1.02313077, "balance_loss_mlp": 1.02600741, "epoch": 0.2709153765218698, "flos": 17674290501120.0, "grad_norm": 2.908619247116778, "language_loss": 0.73532224, "learning_rate": 3.3186797942482025e-06, "loss": 0.75660491, "num_input_tokens_seen": 97474545, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.609375, "step": 4506, "time_per_iteration": 2.3712241649627686 }, { "auxiliary_loss_clip": 0.01083816, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.0161016, "balance_loss_mlp": 1.02535522, "epoch": 0.2709754997745378, "flos": 24455261606400.0, "grad_norm": 1.9930554118530837, "language_loss": 0.80757821, "learning_rate": 3.3183957068928855e-06, "loss": 0.8287493, "num_input_tokens_seen": 97494520, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 4507, "time_per_iteration": 2.4048187732696533 }, { "auxiliary_loss_clip": 0.01015596, "auxiliary_loss_mlp": 0.01003149, "balance_loss_clip": 1.00131321, "balance_loss_mlp": 1.00270295, "epoch": 0.2710356230272058, "flos": 65207664201600.0, "grad_norm": 0.7384456755439164, "language_loss": 0.50821882, "learning_rate": 3.318111572487417e-06, "loss": 0.52840626, "num_input_tokens_seen": 97552455, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.12890625, "step": 4508, "time_per_iteration": 2.949289083480835 }, { "auxiliary_loss_clip": 0.01078772, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.01543427, "balance_loss_mlp": 1.02388668, "epoch": 0.27109574627987376, "flos": 25483243944960.0, "grad_norm": 2.124940899839455, "language_loss": 0.74865228, "learning_rate": 3.3178273910419376e-06, "loss": 0.76974213, "num_input_tokens_seen": 97572650, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.546875, "step": 4509, "time_per_iteration": 2.4222006797790527 }, { "auxiliary_loss_clip": 0.01077173, "auxiliary_loss_mlp": 0.01032298, "balance_loss_clip": 1.01814198, "balance_loss_mlp": 1.02336025, "epoch": 0.2711558695325417, "flos": 19681761432960.0, "grad_norm": 1.9374771585781625, "language_loss": 0.71486199, "learning_rate": 3.3175431625665876e-06, "loss": 0.73595667, "num_input_tokens_seen": 97591150, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.5390625, "step": 4510, "time_per_iteration": 2.387803316116333 }, { "auxiliary_loss_clip": 0.01081239, "auxiliary_loss_mlp": 0.01029419, "balance_loss_clip": 1.01385069, "balance_loss_mlp": 1.02615893, "epoch": 0.2712159927852097, "flos": 18586780462080.0, "grad_norm": 2.3580156690873473, "language_loss": 0.69878447, "learning_rate": 3.317258887071512e-06, "loss": 0.71989107, "num_input_tokens_seen": 97607410, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.55078125, "step": 4511, "time_per_iteration": 2.3518013954162598 }, { "auxiliary_loss_clip": 0.01082062, "auxiliary_loss_mlp": 0.01035458, "balance_loss_clip": 1.01885176, "balance_loss_mlp": 1.02499878, "epoch": 0.27127611603787766, "flos": 25629041249280.0, "grad_norm": 1.9857478699695057, "language_loss": 0.80716193, "learning_rate": 3.3169745645668546e-06, "loss": 0.82833719, "num_input_tokens_seen": 97626870, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5703125, "step": 4512, "time_per_iteration": 2.4412453174591064 }, { "auxiliary_loss_clip": 0.01077305, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.01119757, "balance_loss_mlp": 1.02384079, "epoch": 0.2713362392905456, "flos": 23147833812480.0, "grad_norm": 1.585749548534528, "language_loss": 0.80106896, "learning_rate": 3.3166901950627627e-06, "loss": 0.82209772, "num_input_tokens_seen": 97646595, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.53515625, "step": 4513, "time_per_iteration": 2.399441957473755 }, { "auxiliary_loss_clip": 0.01080465, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 1.01450968, "balance_loss_mlp": 1.02370095, "epoch": 0.2713963625432136, "flos": 18365151951360.0, "grad_norm": 1.8860261571152785, "language_loss": 0.88569802, "learning_rate": 3.3164057785693846e-06, "loss": 0.90679801, "num_input_tokens_seen": 97665485, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5703125, "step": 4514, "time_per_iteration": 2.365255832672119 }, { "auxiliary_loss_clip": 0.01081314, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.02045166, "balance_loss_mlp": 1.02581787, "epoch": 0.27145648579588155, "flos": 22490663690880.0, "grad_norm": 3.4338607898771696, "language_loss": 0.92019075, "learning_rate": 3.316121315096871e-06, "loss": 0.94136542, "num_input_tokens_seen": 97683800, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5546875, "step": 4515, "time_per_iteration": 2.397843360900879 }, { "auxiliary_loss_clip": 0.01087085, "auxiliary_loss_mlp": 0.01042334, "balance_loss_clip": 1.02433372, "balance_loss_mlp": 1.02682376, "epoch": 0.2715166090485495, "flos": 19238329854720.0, "grad_norm": 2.0003139354196002, "language_loss": 0.7318427, "learning_rate": 3.3158368046553724e-06, "loss": 0.75313687, "num_input_tokens_seen": 97700505, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.60546875, "step": 4516, "time_per_iteration": 2.380821704864502 }, { "auxiliary_loss_clip": 0.01081555, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.01571, "balance_loss_mlp": 1.02499664, "epoch": 0.2715767323012175, "flos": 17708714968320.0, "grad_norm": 1.8621901672453118, "language_loss": 0.7606324, "learning_rate": 3.315552247255043e-06, "loss": 0.78177291, "num_input_tokens_seen": 97717410, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.56640625, "step": 4517, "time_per_iteration": 3.725590467453003 }, { "auxiliary_loss_clip": 0.01080733, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.01642156, "balance_loss_mlp": 1.02456748, "epoch": 0.27163685555388545, "flos": 22381734648960.0, "grad_norm": 2.4437410929350625, "language_loss": 0.77187204, "learning_rate": 3.3152676429060385e-06, "loss": 0.79300964, "num_input_tokens_seen": 97734545, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5625, "step": 4518, "time_per_iteration": 2.3933472633361816 }, { "auxiliary_loss_clip": 0.01080056, "auxiliary_loss_mlp": 0.01027976, "balance_loss_clip": 1.01262212, "balance_loss_mlp": 1.02426314, "epoch": 0.2716969788065534, "flos": 22345599525120.0, "grad_norm": 1.6206087204756392, "language_loss": 0.68530637, "learning_rate": 3.3149829916185147e-06, "loss": 0.70638669, "num_input_tokens_seen": 97754000, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.55859375, "step": 4519, "time_per_iteration": 2.3842036724090576 }, { "auxiliary_loss_clip": 0.01079199, "auxiliary_loss_mlp": 0.01029333, "balance_loss_clip": 1.01446807, "balance_loss_mlp": 1.02388358, "epoch": 0.2717571020592214, "flos": 25227295701120.0, "grad_norm": 2.082781881877512, "language_loss": 0.75273436, "learning_rate": 3.314698293402631e-06, "loss": 0.77381968, "num_input_tokens_seen": 97772080, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5546875, "step": 4520, "time_per_iteration": 3.889113426208496 }, { "auxiliary_loss_clip": 0.01085247, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.02028799, "balance_loss_mlp": 1.02649176, "epoch": 0.2718172253118894, "flos": 20188840152960.0, "grad_norm": 2.0582810574866595, "language_loss": 0.76261693, "learning_rate": 3.314413548268546e-06, "loss": 0.78385329, "num_input_tokens_seen": 97789370, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.5859375, "step": 4521, "time_per_iteration": 2.3611133098602295 }, { "auxiliary_loss_clip": 0.01081571, "auxiliary_loss_mlp": 0.01036065, "balance_loss_clip": 1.01924443, "balance_loss_mlp": 1.02396441, "epoch": 0.27187734856455736, "flos": 14318264327040.0, "grad_norm": 2.250188091427427, "language_loss": 0.74971151, "learning_rate": 3.3141287562264232e-06, "loss": 0.77088785, "num_input_tokens_seen": 97807385, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.578125, "step": 4522, "time_per_iteration": 2.402076005935669 }, { "auxiliary_loss_clip": 0.01084066, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.01403904, "balance_loss_mlp": 1.02588773, "epoch": 0.27193747181722533, "flos": 21106566817920.0, "grad_norm": 1.922816484571974, "language_loss": 0.72921813, "learning_rate": 3.3138439172864258e-06, "loss": 0.75035608, "num_input_tokens_seen": 97827930, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.58203125, "step": 4523, "time_per_iteration": 2.3831393718719482 }, { "auxiliary_loss_clip": 0.01078028, "auxiliary_loss_mlp": 0.01035109, "balance_loss_clip": 1.01927781, "balance_loss_mlp": 1.02376866, "epoch": 0.2719975950698933, "flos": 19681761432960.0, "grad_norm": 1.4698126783393182, "language_loss": 0.74400955, "learning_rate": 3.313559031458718e-06, "loss": 0.76514089, "num_input_tokens_seen": 97847440, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.54296875, "step": 4524, "time_per_iteration": 3.7996976375579834 }, { "auxiliary_loss_clip": 0.01080931, "auxiliary_loss_mlp": 0.01032492, "balance_loss_clip": 1.01647043, "balance_loss_mlp": 1.02484608, "epoch": 0.27205771832256126, "flos": 24753314816640.0, "grad_norm": 2.4126873356979854, "language_loss": 0.76205564, "learning_rate": 3.313274098753467e-06, "loss": 0.78318995, "num_input_tokens_seen": 97867620, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5625, "step": 4525, "time_per_iteration": 2.404662609100342 }, { "auxiliary_loss_clip": 0.01080402, "auxiliary_loss_mlp": 0.01035931, "balance_loss_clip": 1.02060056, "balance_loss_mlp": 1.02542555, "epoch": 0.2721178415752292, "flos": 21754694897280.0, "grad_norm": 2.314706947911429, "language_loss": 0.81361967, "learning_rate": 3.3129891191808423e-06, "loss": 0.83478296, "num_input_tokens_seen": 97884345, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.546875, "step": 4526, "time_per_iteration": 2.4147696495056152 }, { "auxiliary_loss_clip": 0.01084144, "auxiliary_loss_mlp": 0.01032727, "balance_loss_clip": 1.01467907, "balance_loss_mlp": 1.0242039, "epoch": 0.2721779648278972, "flos": 12676019794560.0, "grad_norm": 1.9890912207025393, "language_loss": 0.76664495, "learning_rate": 3.312704092751013e-06, "loss": 0.78781366, "num_input_tokens_seen": 97901500, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.6015625, "step": 4527, "time_per_iteration": 3.844364643096924 }, { "auxiliary_loss_clip": 0.01079931, "auxiliary_loss_mlp": 0.0103544, "balance_loss_clip": 1.01782107, "balance_loss_mlp": 1.02274859, "epoch": 0.27223808808056515, "flos": 16252278024960.0, "grad_norm": 1.9572044950173368, "language_loss": 0.82010406, "learning_rate": 3.312419019474151e-06, "loss": 0.84125781, "num_input_tokens_seen": 97917800, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5703125, "step": 4528, "time_per_iteration": 2.352949380874634 }, { "auxiliary_loss_clip": 0.01079986, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.02316439, "balance_loss_mlp": 1.02441454, "epoch": 0.2722982113332331, "flos": 27744568439040.0, "grad_norm": 2.253802296169956, "language_loss": 0.77230215, "learning_rate": 3.3121338993604306e-06, "loss": 0.79349053, "num_input_tokens_seen": 97937225, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5546875, "step": 4529, "time_per_iteration": 2.4298012256622314 }, { "auxiliary_loss_clip": 0.0108186, "auxiliary_loss_mlp": 0.01037919, "balance_loss_clip": 1.02209949, "balance_loss_mlp": 1.02347469, "epoch": 0.2723583345859011, "flos": 21725158020480.0, "grad_norm": 1.8764821701819914, "language_loss": 0.82439351, "learning_rate": 3.3118487324200267e-06, "loss": 0.84559131, "num_input_tokens_seen": 97956845, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5859375, "step": 4530, "time_per_iteration": 2.3779683113098145 }, { "auxiliary_loss_clip": 0.01079985, "auxiliary_loss_mlp": 0.01034443, "balance_loss_clip": 1.01832604, "balance_loss_mlp": 1.02333164, "epoch": 0.27241845783856905, "flos": 17346316389120.0, "grad_norm": 2.0206716892452983, "language_loss": 0.91379881, "learning_rate": 3.3115635186631156e-06, "loss": 0.93494308, "num_input_tokens_seen": 97972465, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.56640625, "step": 4531, "time_per_iteration": 2.35880708694458 }, { "auxiliary_loss_clip": 0.01081303, "auxiliary_loss_mlp": 0.01038301, "balance_loss_clip": 1.02195787, "balance_loss_mlp": 1.02361417, "epoch": 0.272478581091237, "flos": 24753140259840.0, "grad_norm": 2.0681426646192302, "language_loss": 0.76825052, "learning_rate": 3.3112782580998767e-06, "loss": 0.78944653, "num_input_tokens_seen": 97990770, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.578125, "step": 4532, "time_per_iteration": 2.401804208755493 }, { "auxiliary_loss_clip": 0.01077176, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.01507747, "balance_loss_mlp": 1.02413034, "epoch": 0.272538704343905, "flos": 17889774612480.0, "grad_norm": 2.8403586954987157, "language_loss": 0.8898015, "learning_rate": 3.3109929507404895e-06, "loss": 0.91087973, "num_input_tokens_seen": 98005775, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 4533, "time_per_iteration": 2.3338263034820557 }, { "auxiliary_loss_clip": 0.01078865, "auxiliary_loss_mlp": 0.01027358, "balance_loss_clip": 1.0114255, "balance_loss_mlp": 1.02390444, "epoch": 0.272598827596573, "flos": 22930848512640.0, "grad_norm": 1.933835864049127, "language_loss": 0.71540821, "learning_rate": 3.3107075965951355e-06, "loss": 0.73647046, "num_input_tokens_seen": 98025750, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.55078125, "step": 4534, "time_per_iteration": 2.399620771408081 }, { "auxiliary_loss_clip": 0.01080491, "auxiliary_loss_mlp": 0.01036193, "balance_loss_clip": 1.01826429, "balance_loss_mlp": 1.02281284, "epoch": 0.27265895084924097, "flos": 24237403522560.0, "grad_norm": 2.4880431962628156, "language_loss": 0.91023898, "learning_rate": 3.3104221956739996e-06, "loss": 0.93140578, "num_input_tokens_seen": 98044955, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.578125, "step": 4535, "time_per_iteration": 2.3948521614074707 }, { "auxiliary_loss_clip": 0.01083294, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.01386571, "balance_loss_mlp": 1.02550101, "epoch": 0.27271907410190893, "flos": 27012020958720.0, "grad_norm": 1.7963651630385258, "language_loss": 0.73164904, "learning_rate": 3.3101367479872667e-06, "loss": 0.75279337, "num_input_tokens_seen": 98065860, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.578125, "step": 4536, "time_per_iteration": 2.4268381595611572 }, { "auxiliary_loss_clip": 0.01079914, "auxiliary_loss_mlp": 0.0103043, "balance_loss_clip": 1.01486695, "balance_loss_mlp": 1.02373004, "epoch": 0.2727791973545769, "flos": 34451349171840.0, "grad_norm": 1.8991001165122825, "language_loss": 0.71817946, "learning_rate": 3.309851253545123e-06, "loss": 0.73928285, "num_input_tokens_seen": 98085450, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5625, "step": 4537, "time_per_iteration": 2.4885199069976807 }, { "auxiliary_loss_clip": 0.01080491, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 1.01038146, "balance_loss_mlp": 1.02329206, "epoch": 0.27283932060724486, "flos": 15041036626560.0, "grad_norm": 4.403359888192737, "language_loss": 0.78374529, "learning_rate": 3.3095657123577572e-06, "loss": 0.80481833, "num_input_tokens_seen": 98099115, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4538, "time_per_iteration": 2.3258514404296875 }, { "auxiliary_loss_clip": 0.01081866, "auxiliary_loss_mlp": 0.01034917, "balance_loss_clip": 1.01869297, "balance_loss_mlp": 1.02523971, "epoch": 0.2728994438599128, "flos": 21651351672960.0, "grad_norm": 1.466428437806416, "language_loss": 0.90227783, "learning_rate": 3.30928012443536e-06, "loss": 0.9234457, "num_input_tokens_seen": 98118415, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.56640625, "step": 4539, "time_per_iteration": 2.377931833267212 }, { "auxiliary_loss_clip": 0.0107986, "auxiliary_loss_mlp": 0.01029768, "balance_loss_clip": 1.01235127, "balance_loss_mlp": 1.02367878, "epoch": 0.2729595671125808, "flos": 17487610128000.0, "grad_norm": 1.7861445235460929, "language_loss": 0.88047725, "learning_rate": 3.308994489788123e-06, "loss": 0.90157354, "num_input_tokens_seen": 98136300, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 4540, "time_per_iteration": 2.359163999557495 }, { "auxiliary_loss_clip": 0.01080049, "auxiliary_loss_mlp": 0.0102998, "balance_loss_clip": 1.01464987, "balance_loss_mlp": 1.02274692, "epoch": 0.27301969036524876, "flos": 19317128526720.0, "grad_norm": 1.914978820946021, "language_loss": 0.81810862, "learning_rate": 3.308708808426239e-06, "loss": 0.83920884, "num_input_tokens_seen": 98154580, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5703125, "step": 4541, "time_per_iteration": 2.3712704181671143 }, { "auxiliary_loss_clip": 0.01080326, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.01346445, "balance_loss_mlp": 1.02396894, "epoch": 0.2730798136179167, "flos": 21064706231040.0, "grad_norm": 3.455597355351001, "language_loss": 0.79502952, "learning_rate": 3.308423080359905e-06, "loss": 0.81613779, "num_input_tokens_seen": 98173115, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5625, "step": 4542, "time_per_iteration": 2.3875794410705566 }, { "auxiliary_loss_clip": 0.01082546, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.0145514, "balance_loss_mlp": 1.02544093, "epoch": 0.2731399368705847, "flos": 19170737729280.0, "grad_norm": 2.2151386232309314, "language_loss": 0.89530146, "learning_rate": 3.3081373055993167e-06, "loss": 0.91643178, "num_input_tokens_seen": 98190260, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5703125, "step": 4543, "time_per_iteration": 2.3860435485839844 }, { "auxiliary_loss_clip": 0.0108106, "auxiliary_loss_mlp": 0.01033381, "balance_loss_clip": 1.01604807, "balance_loss_mlp": 1.02252424, "epoch": 0.27320006012325265, "flos": 18289320744960.0, "grad_norm": 1.7617949379491604, "language_loss": 0.63285214, "learning_rate": 3.3078514841546728e-06, "loss": 0.65399659, "num_input_tokens_seen": 98207115, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5859375, "step": 4544, "time_per_iteration": 2.3588955402374268 }, { "auxiliary_loss_clip": 0.01082279, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.01612341, "balance_loss_mlp": 1.02477002, "epoch": 0.2732601833759206, "flos": 34859483498880.0, "grad_norm": 1.7956993323709336, "language_loss": 0.69581962, "learning_rate": 3.307565616036174e-06, "loss": 0.71698326, "num_input_tokens_seen": 98230610, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.57421875, "step": 4545, "time_per_iteration": 2.503817558288574 }, { "auxiliary_loss_clip": 0.01019235, "auxiliary_loss_mlp": 0.01003612, "balance_loss_clip": 1.00216949, "balance_loss_mlp": 1.00478387, "epoch": 0.2733203066285886, "flos": 53907709800960.0, "grad_norm": 0.7189251829199559, "language_loss": 0.61637843, "learning_rate": 3.3072797012540214e-06, "loss": 0.63660687, "num_input_tokens_seen": 98293585, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.14453125, "step": 4546, "time_per_iteration": 3.0726189613342285 }, { "auxiliary_loss_clip": 0.010849, "auxiliary_loss_mlp": 0.01036062, "balance_loss_clip": 1.01928902, "balance_loss_mlp": 1.02839208, "epoch": 0.2733804298812566, "flos": 20659539369600.0, "grad_norm": 1.8703870942387204, "language_loss": 0.6477145, "learning_rate": 3.306993739818419e-06, "loss": 0.66892421, "num_input_tokens_seen": 98311680, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 4547, "time_per_iteration": 2.3983898162841797 }, { "auxiliary_loss_clip": 0.01076253, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.01643991, "balance_loss_mlp": 1.0233655, "epoch": 0.27344055313392457, "flos": 25883174102400.0, "grad_norm": 1.9474178713735677, "language_loss": 0.77696288, "learning_rate": 3.3067077317395722e-06, "loss": 0.79804575, "num_input_tokens_seen": 98330770, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.52734375, "step": 4548, "time_per_iteration": 2.4211511611938477 }, { "auxiliary_loss_clip": 0.01080083, "auxiliary_loss_mlp": 0.0103302, "balance_loss_clip": 1.01860762, "balance_loss_mlp": 1.02451921, "epoch": 0.27350067638659253, "flos": 22928649096960.0, "grad_norm": 1.8938285538899577, "language_loss": 0.83013201, "learning_rate": 3.3064216770276874e-06, "loss": 0.85126305, "num_input_tokens_seen": 98349860, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5546875, "step": 4549, "time_per_iteration": 2.4124035835266113 }, { "auxiliary_loss_clip": 0.01081643, "auxiliary_loss_mlp": 0.01034475, "balance_loss_clip": 1.01699853, "balance_loss_mlp": 1.02370894, "epoch": 0.2735607996392605, "flos": 16574072826240.0, "grad_norm": 2.3222990019488106, "language_loss": 0.71106243, "learning_rate": 3.3061355756929733e-06, "loss": 0.73222363, "num_input_tokens_seen": 98367040, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.578125, "step": 4550, "time_per_iteration": 2.3535232543945312 }, { "auxiliary_loss_clip": 0.01079898, "auxiliary_loss_mlp": 0.01028923, "balance_loss_clip": 1.01433134, "balance_loss_mlp": 1.02467299, "epoch": 0.27362092289192846, "flos": 19644299677440.0, "grad_norm": 5.854621502080208, "language_loss": 0.78400576, "learning_rate": 3.305849427745641e-06, "loss": 0.805094, "num_input_tokens_seen": 98384010, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.5546875, "step": 4551, "time_per_iteration": 2.374575614929199 }, { "auxiliary_loss_clip": 0.01081366, "auxiliary_loss_mlp": 0.01037482, "balance_loss_clip": 1.02162719, "balance_loss_mlp": 1.02400029, "epoch": 0.27368104614459643, "flos": 17638190288640.0, "grad_norm": 2.1163440576704122, "language_loss": 0.70711285, "learning_rate": 3.305563233195901e-06, "loss": 0.72830129, "num_input_tokens_seen": 98399625, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.57421875, "step": 4552, "time_per_iteration": 2.3439207077026367 }, { "auxiliary_loss_clip": 0.01082133, "auxiliary_loss_mlp": 0.01033672, "balance_loss_clip": 1.01633906, "balance_loss_mlp": 1.02481949, "epoch": 0.2737411693972644, "flos": 21578941779840.0, "grad_norm": 1.8348327293437081, "language_loss": 0.71781379, "learning_rate": 3.305276992053968e-06, "loss": 0.73897183, "num_input_tokens_seen": 98417310, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5703125, "step": 4553, "time_per_iteration": 2.3851184844970703 }, { "auxiliary_loss_clip": 0.01080432, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.01634884, "balance_loss_mlp": 1.02447724, "epoch": 0.27380129264993236, "flos": 25482859920000.0, "grad_norm": 1.6884843676486623, "language_loss": 0.59042692, "learning_rate": 3.304990704330057e-06, "loss": 0.61156213, "num_input_tokens_seen": 98438670, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.55859375, "step": 4554, "time_per_iteration": 2.436256170272827 }, { "auxiliary_loss_clip": 0.01084195, "auxiliary_loss_mlp": 0.01030496, "balance_loss_clip": 1.01349711, "balance_loss_mlp": 1.02560186, "epoch": 0.2738614159026003, "flos": 18660202784640.0, "grad_norm": 1.6264472836470485, "language_loss": 0.73733985, "learning_rate": 3.304704370034384e-06, "loss": 0.75848675, "num_input_tokens_seen": 98456060, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5859375, "step": 4555, "time_per_iteration": 2.3795883655548096 }, { "auxiliary_loss_clip": 0.01082826, "auxiliary_loss_mlp": 0.01031209, "balance_loss_clip": 1.01437676, "balance_loss_mlp": 1.02564645, "epoch": 0.2739215391552683, "flos": 23476017392640.0, "grad_norm": 7.1475439908342855, "language_loss": 0.77335113, "learning_rate": 3.3044179891771684e-06, "loss": 0.79449153, "num_input_tokens_seen": 98473765, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5703125, "step": 4556, "time_per_iteration": 2.4172799587249756 }, { "auxiliary_loss_clip": 0.0108907, "auxiliary_loss_mlp": 0.01039534, "balance_loss_clip": 1.0221293, "balance_loss_mlp": 1.02616084, "epoch": 0.27398166240793626, "flos": 17127690255360.0, "grad_norm": 2.317930043852284, "language_loss": 0.82052898, "learning_rate": 3.3041315617686298e-06, "loss": 0.84181505, "num_input_tokens_seen": 98490590, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.62890625, "step": 4557, "time_per_iteration": 3.7227389812469482 }, { "auxiliary_loss_clip": 0.01080511, "auxiliary_loss_mlp": 0.01035487, "balance_loss_clip": 1.0191437, "balance_loss_mlp": 1.02469683, "epoch": 0.2740417856606042, "flos": 23403607499520.0, "grad_norm": 1.7172891543294506, "language_loss": 0.72389257, "learning_rate": 3.303845087818991e-06, "loss": 0.74505258, "num_input_tokens_seen": 98510590, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.55859375, "step": 4558, "time_per_iteration": 2.3939926624298096 }, { "auxiliary_loss_clip": 0.01080288, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.01655221, "balance_loss_mlp": 1.02412319, "epoch": 0.2741019089132722, "flos": 12779781955200.0, "grad_norm": 2.5956017130927895, "language_loss": 0.68328184, "learning_rate": 3.3035585673384745e-06, "loss": 0.70440602, "num_input_tokens_seen": 98527875, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5625, "step": 4559, "time_per_iteration": 2.37117862701416 }, { "auxiliary_loss_clip": 0.01079388, "auxiliary_loss_mlp": 0.01034454, "balance_loss_clip": 1.01826513, "balance_loss_mlp": 1.02354228, "epoch": 0.27416203216594015, "flos": 20630491251840.0, "grad_norm": 1.769169887381498, "language_loss": 0.72384578, "learning_rate": 3.3032720003373057e-06, "loss": 0.74498415, "num_input_tokens_seen": 98547575, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55859375, "step": 4560, "time_per_iteration": 3.7601819038391113 }, { "auxiliary_loss_clip": 0.01081038, "auxiliary_loss_mlp": 0.01035452, "balance_loss_clip": 1.01945376, "balance_loss_mlp": 1.02398014, "epoch": 0.27422215541860817, "flos": 26540379135360.0, "grad_norm": 2.2048692146699955, "language_loss": 0.81552273, "learning_rate": 3.302985386825712e-06, "loss": 0.83668756, "num_input_tokens_seen": 98566290, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5703125, "step": 4561, "time_per_iteration": 2.4507904052734375 }, { "auxiliary_loss_clip": 0.01082992, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.0165652, "balance_loss_mlp": 1.0246489, "epoch": 0.27428227867127614, "flos": 23330045531520.0, "grad_norm": 2.4658243450454993, "language_loss": 0.754637, "learning_rate": 3.302698726813921e-06, "loss": 0.77579594, "num_input_tokens_seen": 98586255, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5859375, "step": 4562, "time_per_iteration": 2.3994991779327393 }, { "auxiliary_loss_clip": 0.01081196, "auxiliary_loss_mlp": 0.01032378, "balance_loss_clip": 1.01691628, "balance_loss_mlp": 1.02525234, "epoch": 0.2743424019239441, "flos": 23034121914240.0, "grad_norm": 1.7169728534065862, "language_loss": 0.74742758, "learning_rate": 3.3024120203121637e-06, "loss": 0.76856327, "num_input_tokens_seen": 98606030, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5625, "step": 4563, "time_per_iteration": 2.418084144592285 }, { "auxiliary_loss_clip": 0.0108587, "auxiliary_loss_mlp": 0.01046262, "balance_loss_clip": 1.02741539, "balance_loss_mlp": 1.02481794, "epoch": 0.27440252517661207, "flos": 21980024012160.0, "grad_norm": 1.5924422647558862, "language_loss": 0.6261344, "learning_rate": 3.302125267330672e-06, "loss": 0.64745569, "num_input_tokens_seen": 98625225, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.609375, "step": 4564, "time_per_iteration": 3.791637420654297 }, { "auxiliary_loss_clip": 0.01082194, "auxiliary_loss_mlp": 0.01033087, "balance_loss_clip": 1.0160042, "balance_loss_mlp": 1.02397561, "epoch": 0.27446264842928003, "flos": 40185867962880.0, "grad_norm": 1.8725739588961885, "language_loss": 0.78442985, "learning_rate": 3.3018384678796786e-06, "loss": 0.80558264, "num_input_tokens_seen": 98649470, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.58203125, "step": 4565, "time_per_iteration": 2.54919695854187 }, { "auxiliary_loss_clip": 0.01081763, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.01489639, "balance_loss_mlp": 1.02574587, "epoch": 0.274522771681948, "flos": 13478847575040.0, "grad_norm": 1.8001092575692066, "language_loss": 0.68084419, "learning_rate": 3.3015516219694186e-06, "loss": 0.70197183, "num_input_tokens_seen": 98666915, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.55859375, "step": 4566, "time_per_iteration": 3.755009412765503 }, { "auxiliary_loss_clip": 0.01080487, "auxiliary_loss_mlp": 0.01035123, "balance_loss_clip": 1.02044201, "balance_loss_mlp": 1.02552259, "epoch": 0.27458289493461596, "flos": 28620853453440.0, "grad_norm": 1.875365240259041, "language_loss": 0.61288488, "learning_rate": 3.3012647296101296e-06, "loss": 0.63404095, "num_input_tokens_seen": 98688240, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.55078125, "step": 4567, "time_per_iteration": 2.4268686771392822 }, { "auxiliary_loss_clip": 0.01082299, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.01952434, "balance_loss_mlp": 1.02608514, "epoch": 0.2746430181872839, "flos": 20118804232320.0, "grad_norm": 1.6513416938406327, "language_loss": 0.82164323, "learning_rate": 3.30097779081205e-06, "loss": 0.84283352, "num_input_tokens_seen": 98708245, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5625, "step": 4568, "time_per_iteration": 2.390467882156372 }, { "auxiliary_loss_clip": 0.01082325, "auxiliary_loss_mlp": 0.01029951, "balance_loss_clip": 1.01339316, "balance_loss_mlp": 1.02532458, "epoch": 0.2747031414399519, "flos": 20192436023040.0, "grad_norm": 1.8725538972554452, "language_loss": 0.68479341, "learning_rate": 3.300690805585419e-06, "loss": 0.70591617, "num_input_tokens_seen": 98724575, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5703125, "step": 4569, "time_per_iteration": 2.3657889366149902 }, { "auxiliary_loss_clip": 0.01083034, "auxiliary_loss_mlp": 0.0103448, "balance_loss_clip": 1.01734972, "balance_loss_mlp": 1.02505016, "epoch": 0.27476326469261986, "flos": 13515506369280.0, "grad_norm": 2.6925017276632617, "language_loss": 0.70550632, "learning_rate": 3.300403773940479e-06, "loss": 0.72668153, "num_input_tokens_seen": 98740700, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58203125, "step": 4570, "time_per_iteration": 2.341416597366333 }, { "auxiliary_loss_clip": 0.01017436, "auxiliary_loss_mlp": 0.01010343, "balance_loss_clip": 1.008793, "balance_loss_mlp": 1.00421321, "epoch": 0.2748233879452878, "flos": 65934067282560.0, "grad_norm": 0.7544978614366927, "language_loss": 0.55759937, "learning_rate": 3.3001166958874738e-06, "loss": 0.5778771, "num_input_tokens_seen": 98803030, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.1328125, "step": 4571, "time_per_iteration": 3.134333372116089 }, { "auxiliary_loss_clip": 0.01085919, "auxiliary_loss_mlp": 0.01034416, "balance_loss_clip": 1.01726198, "balance_loss_mlp": 1.02698457, "epoch": 0.2748835111979558, "flos": 17383254474240.0, "grad_norm": 2.4238304663184067, "language_loss": 0.7786752, "learning_rate": 3.299829571436648e-06, "loss": 0.7998786, "num_input_tokens_seen": 98820505, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58984375, "step": 4572, "time_per_iteration": 2.3765413761138916 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.01028726, "balance_loss_clip": 1.01426625, "balance_loss_mlp": 1.02383137, "epoch": 0.27494363445062375, "flos": 23586412711680.0, "grad_norm": 1.6327844311762805, "language_loss": 0.81324852, "learning_rate": 3.2995424005982475e-06, "loss": 0.8343029, "num_input_tokens_seen": 98842150, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.52734375, "step": 4573, "time_per_iteration": 2.4041430950164795 }, { "auxiliary_loss_clip": 0.01079209, "auxiliary_loss_mlp": 0.01028755, "balance_loss_clip": 1.01344848, "balance_loss_mlp": 1.02476406, "epoch": 0.2750037577032918, "flos": 17163650822400.0, "grad_norm": 2.0728941071216362, "language_loss": 0.7886256, "learning_rate": 3.299255183382522e-06, "loss": 0.80970526, "num_input_tokens_seen": 98861050, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.54296875, "step": 4574, "time_per_iteration": 2.368095636367798 }, { "auxiliary_loss_clip": 0.01082773, "auxiliary_loss_mlp": 0.01035358, "balance_loss_clip": 1.01949668, "balance_loss_mlp": 1.02525806, "epoch": 0.27506388095595974, "flos": 24490942882560.0, "grad_norm": 2.0706887074553095, "language_loss": 0.74229103, "learning_rate": 3.298967919799722e-06, "loss": 0.76347232, "num_input_tokens_seen": 98879695, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.57421875, "step": 4575, "time_per_iteration": 2.3982925415039062 }, { "auxiliary_loss_clip": 0.01081555, "auxiliary_loss_mlp": 0.01030455, "balance_loss_clip": 1.01481473, "balance_loss_mlp": 1.02672315, "epoch": 0.2751240042086277, "flos": 38763157259520.0, "grad_norm": 1.6528136680417422, "language_loss": 0.717233, "learning_rate": 3.2986806098600973e-06, "loss": 0.73835313, "num_input_tokens_seen": 98902035, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.546875, "step": 4576, "time_per_iteration": 2.5271897315979004 }, { "auxiliary_loss_clip": 0.01082386, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.01652813, "balance_loss_mlp": 1.02639341, "epoch": 0.27518412746129567, "flos": 26905815002880.0, "grad_norm": 1.6121338137589687, "language_loss": 0.73308748, "learning_rate": 3.298393253573902e-06, "loss": 0.75423419, "num_input_tokens_seen": 98921835, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5625, "step": 4577, "time_per_iteration": 2.4411284923553467 }, { "auxiliary_loss_clip": 0.01082051, "auxiliary_loss_mlp": 0.01033499, "balance_loss_clip": 1.01599896, "balance_loss_mlp": 1.02485323, "epoch": 0.27524425071396363, "flos": 24899356500480.0, "grad_norm": 2.3698060249909685, "language_loss": 0.76228505, "learning_rate": 3.298105850951392e-06, "loss": 0.78344053, "num_input_tokens_seen": 98939610, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.57421875, "step": 4578, "time_per_iteration": 2.4060704708099365 }, { "auxiliary_loss_clip": 0.01083281, "auxiliary_loss_mlp": 0.01034075, "balance_loss_clip": 1.01386905, "balance_loss_mlp": 1.02491999, "epoch": 0.2753043739666316, "flos": 26286804864000.0, "grad_norm": 1.3655607706506157, "language_loss": 0.65976298, "learning_rate": 3.2978184020028232e-06, "loss": 0.68093652, "num_input_tokens_seen": 98962250, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.5859375, "step": 4579, "time_per_iteration": 2.453016519546509 }, { "auxiliary_loss_clip": 0.0108413, "auxiliary_loss_mlp": 0.01036867, "balance_loss_clip": 1.01864028, "balance_loss_mlp": 1.02587795, "epoch": 0.27536449721929956, "flos": 24205632318720.0, "grad_norm": 1.7724725800381609, "language_loss": 0.79881054, "learning_rate": 3.297530906738454e-06, "loss": 0.8200205, "num_input_tokens_seen": 98981845, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.5859375, "step": 4580, "time_per_iteration": 2.41353440284729 }, { "auxiliary_loss_clip": 0.01081912, "auxiliary_loss_mlp": 0.01036067, "balance_loss_clip": 1.018543, "balance_loss_mlp": 1.02512968, "epoch": 0.27542462047196753, "flos": 19536243419520.0, "grad_norm": 1.5336696001801486, "language_loss": 0.67433882, "learning_rate": 3.297243365168544e-06, "loss": 0.69551861, "num_input_tokens_seen": 99001855, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.56640625, "step": 4581, "time_per_iteration": 2.403851270675659 }, { "auxiliary_loss_clip": 0.01082081, "auxiliary_loss_mlp": 0.01033691, "balance_loss_clip": 1.01818204, "balance_loss_mlp": 1.02589679, "epoch": 0.2754847437246355, "flos": 14318299238400.0, "grad_norm": 1.6953735686706208, "language_loss": 0.78049088, "learning_rate": 3.2969557773033555e-06, "loss": 0.80164862, "num_input_tokens_seen": 99019880, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5625, "step": 4582, "time_per_iteration": 2.355501890182495 }, { "auxiliary_loss_clip": 0.01081738, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.01573801, "balance_loss_mlp": 1.02502, "epoch": 0.27554486697730346, "flos": 18837910938240.0, "grad_norm": 1.6126989313064397, "language_loss": 0.84509051, "learning_rate": 3.296668143153152e-06, "loss": 0.86621881, "num_input_tokens_seen": 99037570, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.56640625, "step": 4583, "time_per_iteration": 2.3900318145751953 }, { "auxiliary_loss_clip": 0.01083269, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.01138651, "balance_loss_mlp": 1.02407956, "epoch": 0.2756049902299714, "flos": 22381210978560.0, "grad_norm": 2.3227855332646983, "language_loss": 0.66627789, "learning_rate": 3.296380462728197e-06, "loss": 0.68738925, "num_input_tokens_seen": 99056875, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.59375, "step": 4584, "time_per_iteration": 2.3854594230651855 }, { "auxiliary_loss_clip": 0.01079057, "auxiliary_loss_mlp": 0.01035382, "balance_loss_clip": 1.01950383, "balance_loss_mlp": 1.02439451, "epoch": 0.2756651134826394, "flos": 19572867302400.0, "grad_norm": 2.8000583162095687, "language_loss": 0.77081859, "learning_rate": 3.2960927360387585e-06, "loss": 0.79196298, "num_input_tokens_seen": 99074685, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 4585, "time_per_iteration": 2.4002299308776855 }, { "auxiliary_loss_clip": 0.01085499, "auxiliary_loss_mlp": 0.01030279, "balance_loss_clip": 1.01256418, "balance_loss_mlp": 1.02719843, "epoch": 0.27572523673530736, "flos": 23585435193600.0, "grad_norm": 1.5410654888681528, "language_loss": 0.71680582, "learning_rate": 3.2958049630951038e-06, "loss": 0.73796368, "num_input_tokens_seen": 99095300, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5859375, "step": 4586, "time_per_iteration": 2.405465602874756 }, { "auxiliary_loss_clip": 0.0108218, "auxiliary_loss_mlp": 0.01032635, "balance_loss_clip": 1.01612413, "balance_loss_mlp": 1.02582371, "epoch": 0.2757853599879754, "flos": 22819021827840.0, "grad_norm": 1.4985826599983278, "language_loss": 0.80514759, "learning_rate": 3.295517143907504e-06, "loss": 0.82629573, "num_input_tokens_seen": 99115965, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5625, "step": 4587, "time_per_iteration": 2.390324592590332 }, { "auxiliary_loss_clip": 0.01079336, "auxiliary_loss_mlp": 0.01031305, "balance_loss_clip": 1.01436579, "balance_loss_mlp": 1.02460158, "epoch": 0.27584548324064334, "flos": 18550715160960.0, "grad_norm": 1.9644734697829243, "language_loss": 0.83078039, "learning_rate": 3.2952292784862286e-06, "loss": 0.85188687, "num_input_tokens_seen": 99134265, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.546875, "step": 4588, "time_per_iteration": 2.3554866313934326 }, { "auxiliary_loss_clip": 0.01080198, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.01501894, "balance_loss_mlp": 1.02497363, "epoch": 0.2759056064933113, "flos": 23768729164800.0, "grad_norm": 1.5054866330696839, "language_loss": 0.75276405, "learning_rate": 3.2949413668415526e-06, "loss": 0.77388132, "num_input_tokens_seen": 99156185, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5546875, "step": 4589, "time_per_iteration": 2.415968656539917 }, { "auxiliary_loss_clip": 0.01080159, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.01684308, "balance_loss_mlp": 1.02463949, "epoch": 0.27596572974597927, "flos": 24280695475200.0, "grad_norm": 1.6133244888946692, "language_loss": 0.87966681, "learning_rate": 3.29465340898375e-06, "loss": 0.90080535, "num_input_tokens_seen": 99176735, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5546875, "step": 4590, "time_per_iteration": 2.4039664268493652 }, { "auxiliary_loss_clip": 0.01081942, "auxiliary_loss_mlp": 0.01035953, "balance_loss_clip": 1.01848912, "balance_loss_mlp": 1.0249052, "epoch": 0.27602585299864724, "flos": 35039600536320.0, "grad_norm": 1.5111284181607159, "language_loss": 0.71183312, "learning_rate": 3.2943654049230982e-06, "loss": 0.73301208, "num_input_tokens_seen": 99199765, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5703125, "step": 4591, "time_per_iteration": 2.498314619064331 }, { "auxiliary_loss_clip": 0.01084116, "auxiliary_loss_mlp": 0.01037484, "balance_loss_clip": 1.0199604, "balance_loss_mlp": 1.02584445, "epoch": 0.2760859762513152, "flos": 24308451872640.0, "grad_norm": 2.921450069477129, "language_loss": 0.79913783, "learning_rate": 3.2940773546698745e-06, "loss": 0.82035381, "num_input_tokens_seen": 99218435, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.58203125, "step": 4592, "time_per_iteration": 2.396172046661377 }, { "auxiliary_loss_clip": 0.01017458, "auxiliary_loss_mlp": 0.01001094, "balance_loss_clip": 0.99965775, "balance_loss_mlp": 1.00372052, "epoch": 0.27614609950398317, "flos": 71257623926400.0, "grad_norm": 0.7090126224094614, "language_loss": 0.6163981, "learning_rate": 3.2937892582343574e-06, "loss": 0.63658363, "num_input_tokens_seen": 99276200, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.13769531, "step": 4593, "time_per_iteration": 2.9940025806427 }, { "auxiliary_loss_clip": 0.01082116, "auxiliary_loss_mlp": 0.01034019, "balance_loss_clip": 1.01617348, "balance_loss_mlp": 1.02496839, "epoch": 0.27620622275665113, "flos": 29673694546560.0, "grad_norm": 1.8497206346215327, "language_loss": 0.77398038, "learning_rate": 3.2935011156268313e-06, "loss": 0.7951417, "num_input_tokens_seen": 99297625, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5703125, "step": 4594, "time_per_iteration": 2.437809944152832 }, { "auxiliary_loss_clip": 0.01082449, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.01533985, "balance_loss_mlp": 1.02691197, "epoch": 0.2762663460093191, "flos": 15377145085440.0, "grad_norm": 1.4000886716608236, "language_loss": 0.91553831, "learning_rate": 3.293212926857577e-06, "loss": 0.93667102, "num_input_tokens_seen": 99315790, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5546875, "step": 4595, "time_per_iteration": 2.369683027267456 }, { "auxiliary_loss_clip": 0.01084055, "auxiliary_loss_mlp": 0.01042277, "balance_loss_clip": 1.02439547, "balance_loss_mlp": 1.02569568, "epoch": 0.27632646926198706, "flos": 20703040790400.0, "grad_norm": 2.0167650406613666, "language_loss": 0.69258326, "learning_rate": 3.2929246919368796e-06, "loss": 0.71384656, "num_input_tokens_seen": 99334615, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5859375, "step": 4596, "time_per_iteration": 3.773935317993164 }, { "auxiliary_loss_clip": 0.01088814, "auxiliary_loss_mlp": 0.01036872, "balance_loss_clip": 1.01926494, "balance_loss_mlp": 1.02745521, "epoch": 0.276386592514655, "flos": 32812107016320.0, "grad_norm": 7.64244018348157, "language_loss": 0.63830537, "learning_rate": 3.2926364108750263e-06, "loss": 0.65956223, "num_input_tokens_seen": 99356685, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.61328125, "step": 4597, "time_per_iteration": 2.4807288646698 }, { "auxiliary_loss_clip": 0.01084371, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.02322757, "balance_loss_mlp": 1.02855742, "epoch": 0.276446715767323, "flos": 18550715160960.0, "grad_norm": 1.9211654847174242, "language_loss": 0.86382246, "learning_rate": 3.292348083682304e-06, "loss": 0.88505822, "num_input_tokens_seen": 99374810, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55859375, "step": 4598, "time_per_iteration": 2.3577988147735596 }, { "auxiliary_loss_clip": 0.01084663, "auxiliary_loss_mlp": 0.01033314, "balance_loss_clip": 1.01526618, "balance_loss_mlp": 1.02585006, "epoch": 0.27650683901999096, "flos": 22818533068800.0, "grad_norm": 3.161751698695146, "language_loss": 0.80081058, "learning_rate": 3.2920597103690035e-06, "loss": 0.82199037, "num_input_tokens_seen": 99391290, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.58984375, "step": 4599, "time_per_iteration": 2.3980748653411865 }, { "auxiliary_loss_clip": 0.01085815, "auxiliary_loss_mlp": 0.01037315, "balance_loss_clip": 1.01992202, "balance_loss_mlp": 1.02654839, "epoch": 0.276566962272659, "flos": 21360455291520.0, "grad_norm": 1.668511014208387, "language_loss": 0.78368044, "learning_rate": 3.2917712909454148e-06, "loss": 0.80491167, "num_input_tokens_seen": 99409120, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59375, "step": 4600, "time_per_iteration": 3.7771215438842773 }, { "auxiliary_loss_clip": 0.01087953, "auxiliary_loss_mlp": 0.01041896, "balance_loss_clip": 1.02378857, "balance_loss_mlp": 1.02644753, "epoch": 0.27662708552532694, "flos": 17709692486400.0, "grad_norm": 1.6785573351567948, "language_loss": 0.73203355, "learning_rate": 3.291482825421832e-06, "loss": 0.75333202, "num_input_tokens_seen": 99426180, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.61328125, "step": 4601, "time_per_iteration": 2.3417866230010986 }, { "auxiliary_loss_clip": 0.01081014, "auxiliary_loss_mlp": 0.01035789, "balance_loss_clip": 1.01808619, "balance_loss_mlp": 1.02512193, "epoch": 0.2766872087779949, "flos": 21251630983680.0, "grad_norm": 1.4752810883295984, "language_loss": 0.79993773, "learning_rate": 3.2911943138085496e-06, "loss": 0.82110578, "num_input_tokens_seen": 99447720, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.55859375, "step": 4602, "time_per_iteration": 2.441012144088745 }, { "auxiliary_loss_clip": 0.01086108, "auxiliary_loss_mlp": 0.01044316, "balance_loss_clip": 1.02509975, "balance_loss_mlp": 1.02552462, "epoch": 0.2767473320306629, "flos": 12931095254400.0, "grad_norm": 2.008895937106637, "language_loss": 0.77057678, "learning_rate": 3.290905756115863e-06, "loss": 0.79188102, "num_input_tokens_seen": 99464720, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.60546875, "step": 4603, "time_per_iteration": 3.690415143966675 }, { "auxiliary_loss_clip": 0.01081639, "auxiliary_loss_mlp": 0.01035623, "balance_loss_clip": 1.01981592, "balance_loss_mlp": 1.02666855, "epoch": 0.27680745528333084, "flos": 15011953597440.0, "grad_norm": 1.5138019901121496, "language_loss": 0.81632638, "learning_rate": 3.2906171523540706e-06, "loss": 0.83749896, "num_input_tokens_seen": 99482310, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55078125, "step": 4604, "time_per_iteration": 2.365110397338867 }, { "auxiliary_loss_clip": 0.01083097, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.01291013, "balance_loss_mlp": 1.0243206, "epoch": 0.2768675785359988, "flos": 22636740286080.0, "grad_norm": 1.790714838560572, "language_loss": 0.69702685, "learning_rate": 3.2903285025334723e-06, "loss": 0.71816272, "num_input_tokens_seen": 99501255, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5859375, "step": 4605, "time_per_iteration": 2.386167287826538 }, { "auxiliary_loss_clip": 0.01016614, "auxiliary_loss_mlp": 0.01006461, "balance_loss_clip": 1.00462472, "balance_loss_mlp": 1.00287592, "epoch": 0.27692770178866677, "flos": 66127171345920.0, "grad_norm": 0.7063068457666765, "language_loss": 0.57166123, "learning_rate": 3.290039806664368e-06, "loss": 0.591892, "num_input_tokens_seen": 99568925, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.13769531, "step": 4606, "time_per_iteration": 4.494596719741821 }, { "auxiliary_loss_clip": 0.01085999, "auxiliary_loss_mlp": 0.01035442, "balance_loss_clip": 1.01853848, "balance_loss_mlp": 1.02714872, "epoch": 0.27698782504133473, "flos": 26463884613120.0, "grad_norm": 1.8228375418366305, "language_loss": 0.691333, "learning_rate": 3.2897510647570626e-06, "loss": 0.71254742, "num_input_tokens_seen": 99588455, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5859375, "step": 4607, "time_per_iteration": 2.4294941425323486 }, { "auxiliary_loss_clip": 0.01080386, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.01674294, "balance_loss_mlp": 1.02615881, "epoch": 0.2770479482940027, "flos": 25883627950080.0, "grad_norm": 1.8111466225444839, "language_loss": 0.69376171, "learning_rate": 3.2894622768218587e-06, "loss": 0.71489131, "num_input_tokens_seen": 99609355, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.54296875, "step": 4608, "time_per_iteration": 2.4185144901275635 }, { "auxiliary_loss_clip": 0.01084521, "auxiliary_loss_mlp": 0.01030639, "balance_loss_clip": 1.01412845, "balance_loss_mlp": 1.02504349, "epoch": 0.27710807154667066, "flos": 22856134469760.0, "grad_norm": 1.7370462806151117, "language_loss": 0.72688055, "learning_rate": 3.289173442869063e-06, "loss": 0.74803215, "num_input_tokens_seen": 99628780, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.59375, "step": 4609, "time_per_iteration": 2.387050151824951 }, { "auxiliary_loss_clip": 0.01082892, "auxiliary_loss_mlp": 0.01038174, "balance_loss_clip": 1.02035236, "balance_loss_mlp": 1.02384937, "epoch": 0.27716819479933863, "flos": 17710146334080.0, "grad_norm": 2.3648961972190876, "language_loss": 0.83644879, "learning_rate": 3.2888845629089833e-06, "loss": 0.85765946, "num_input_tokens_seen": 99644545, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58984375, "step": 4610, "time_per_iteration": 2.3297476768493652 }, { "auxiliary_loss_clip": 0.01088104, "auxiliary_loss_mlp": 0.01041071, "balance_loss_clip": 1.02184236, "balance_loss_mlp": 1.02676606, "epoch": 0.2772283180520066, "flos": 19645032816000.0, "grad_norm": 2.0264394387545672, "language_loss": 0.68972641, "learning_rate": 3.2885956369519287e-06, "loss": 0.71101815, "num_input_tokens_seen": 99663125, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.61328125, "step": 4611, "time_per_iteration": 2.359511137008667 }, { "auxiliary_loss_clip": 0.01080434, "auxiliary_loss_mlp": 0.01031907, "balance_loss_clip": 1.01537228, "balance_loss_mlp": 1.02680779, "epoch": 0.27728844130467456, "flos": 21031573484160.0, "grad_norm": 2.072752939070887, "language_loss": 0.73418701, "learning_rate": 3.2883066650082106e-06, "loss": 0.75531048, "num_input_tokens_seen": 99682645, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5390625, "step": 4612, "time_per_iteration": 2.380183696746826 }, { "auxiliary_loss_clip": 0.01084216, "auxiliary_loss_mlp": 0.01039433, "balance_loss_clip": 1.02108717, "balance_loss_mlp": 1.02541828, "epoch": 0.2773485645573425, "flos": 18988211808000.0, "grad_norm": 2.4242154777305873, "language_loss": 0.66694748, "learning_rate": 3.288017647088142e-06, "loss": 0.6881839, "num_input_tokens_seen": 99700520, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.5859375, "step": 4613, "time_per_iteration": 2.379488468170166 }, { "auxiliary_loss_clip": 0.01082104, "auxiliary_loss_mlp": 0.0103418, "balance_loss_clip": 1.01737177, "balance_loss_mlp": 1.02516592, "epoch": 0.27740868781001055, "flos": 21467429297280.0, "grad_norm": 1.653807165990663, "language_loss": 0.7916218, "learning_rate": 3.2877285832020363e-06, "loss": 0.81278467, "num_input_tokens_seen": 99720355, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 4614, "time_per_iteration": 2.3730251789093018 }, { "auxiliary_loss_clip": 0.0108533, "auxiliary_loss_mlp": 0.01032106, "balance_loss_clip": 1.01401019, "balance_loss_mlp": 1.02670074, "epoch": 0.2774688110626785, "flos": 19826825598720.0, "grad_norm": 6.762787683563088, "language_loss": 0.80093497, "learning_rate": 3.28743947336021e-06, "loss": 0.82210934, "num_input_tokens_seen": 99736090, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.5859375, "step": 4615, "time_per_iteration": 2.3554940223693848 }, { "auxiliary_loss_clip": 0.01085945, "auxiliary_loss_mlp": 0.01037214, "balance_loss_clip": 1.01949966, "balance_loss_mlp": 1.02532148, "epoch": 0.2775289343153465, "flos": 18215444574720.0, "grad_norm": 2.480429147021416, "language_loss": 0.64004666, "learning_rate": 3.2871503175729807e-06, "loss": 0.66127825, "num_input_tokens_seen": 99751805, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.60546875, "step": 4616, "time_per_iteration": 2.3342959880828857 }, { "auxiliary_loss_clip": 0.01081778, "auxiliary_loss_mlp": 0.01033609, "balance_loss_clip": 1.01616848, "balance_loss_mlp": 1.02426696, "epoch": 0.27758905756801444, "flos": 16471532563200.0, "grad_norm": 2.0166782763790176, "language_loss": 0.82064974, "learning_rate": 3.286861115850667e-06, "loss": 0.84180367, "num_input_tokens_seen": 99770610, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.57421875, "step": 4617, "time_per_iteration": 2.3683419227600098 }, { "auxiliary_loss_clip": 0.01082022, "auxiliary_loss_mlp": 0.01036133, "balance_loss_clip": 1.01902628, "balance_loss_mlp": 1.02380896, "epoch": 0.2776491808206824, "flos": 18727410885120.0, "grad_norm": 2.993876892819792, "language_loss": 0.76747072, "learning_rate": 3.286571868203591e-06, "loss": 0.7886523, "num_input_tokens_seen": 99787305, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.58203125, "step": 4618, "time_per_iteration": 2.369157552719116 }, { "auxiliary_loss_clip": 0.01085982, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.01619971, "balance_loss_mlp": 1.02738285, "epoch": 0.27770930407335037, "flos": 28036931097600.0, "grad_norm": 2.1734029909292523, "language_loss": 0.84983563, "learning_rate": 3.286282574642074e-06, "loss": 0.87101841, "num_input_tokens_seen": 99808940, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5859375, "step": 4619, "time_per_iteration": 2.4373459815979004 }, { "auxiliary_loss_clip": 0.01080802, "auxiliary_loss_mlp": 0.01033186, "balance_loss_clip": 1.0183804, "balance_loss_mlp": 1.02558863, "epoch": 0.27776942732601834, "flos": 23548706576640.0, "grad_norm": 1.916943280515146, "language_loss": 0.76711893, "learning_rate": 3.2859932351764413e-06, "loss": 0.78825879, "num_input_tokens_seen": 99829575, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5546875, "step": 4620, "time_per_iteration": 2.4141008853912354 }, { "auxiliary_loss_clip": 0.01080938, "auxiliary_loss_mlp": 0.01035295, "balance_loss_clip": 1.0185225, "balance_loss_mlp": 1.02503717, "epoch": 0.2778295505786863, "flos": 23907753665280.0, "grad_norm": 2.5857560160133137, "language_loss": 0.78369844, "learning_rate": 3.2857038498170175e-06, "loss": 0.80486083, "num_input_tokens_seen": 99847575, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.55859375, "step": 4621, "time_per_iteration": 2.418139696121216 }, { "auxiliary_loss_clip": 0.01085422, "auxiliary_loss_mlp": 0.0103963, "balance_loss_clip": 1.02109277, "balance_loss_mlp": 1.02669621, "epoch": 0.27788967383135427, "flos": 25553454422400.0, "grad_norm": 26.338067880850264, "language_loss": 0.87562191, "learning_rate": 3.2854144185741303e-06, "loss": 0.89687252, "num_input_tokens_seen": 99864995, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.5859375, "step": 4622, "time_per_iteration": 2.416123151779175 }, { "auxiliary_loss_clip": 0.01084512, "auxiliary_loss_mlp": 0.01041391, "balance_loss_clip": 1.02359343, "balance_loss_mlp": 1.02586424, "epoch": 0.27794979708402223, "flos": 16251719443200.0, "grad_norm": 2.663149024933452, "language_loss": 0.81161809, "learning_rate": 3.285124941458109e-06, "loss": 0.8328771, "num_input_tokens_seen": 99881540, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.5859375, "step": 4623, "time_per_iteration": 2.3581767082214355 }, { "auxiliary_loss_clip": 0.01085203, "auxiliary_loss_mlp": 0.01034785, "balance_loss_clip": 1.01748776, "balance_loss_mlp": 1.02682495, "epoch": 0.2780099203366902, "flos": 20666591464320.0, "grad_norm": 2.4735003687486343, "language_loss": 0.81395102, "learning_rate": 3.2848354184792845e-06, "loss": 0.8351509, "num_input_tokens_seen": 99899595, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.58203125, "step": 4624, "time_per_iteration": 2.3813822269439697 }, { "auxiliary_loss_clip": 0.01082105, "auxiliary_loss_mlp": 0.010371, "balance_loss_clip": 1.01976693, "balance_loss_mlp": 1.02624857, "epoch": 0.27807004358935816, "flos": 17738880249600.0, "grad_norm": 2.8422111400850256, "language_loss": 0.7708801, "learning_rate": 3.284545849647989e-06, "loss": 0.79207218, "num_input_tokens_seen": 99913020, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5546875, "step": 4625, "time_per_iteration": 2.3180911540985107 }, { "auxiliary_loss_clip": 0.01082543, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.01828074, "balance_loss_mlp": 1.02491689, "epoch": 0.2781301668420261, "flos": 16726189086720.0, "grad_norm": 1.9770042224717426, "language_loss": 0.69873786, "learning_rate": 3.284256234974556e-06, "loss": 0.71992254, "num_input_tokens_seen": 99931405, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.578125, "step": 4626, "time_per_iteration": 2.353443145751953 }, { "auxiliary_loss_clip": 0.0108804, "auxiliary_loss_mlp": 0.01038142, "balance_loss_clip": 1.01935506, "balance_loss_mlp": 1.02624512, "epoch": 0.27819029009469415, "flos": 13843899417600.0, "grad_norm": 2.3771990740611533, "language_loss": 0.92214501, "learning_rate": 3.2839665744693222e-06, "loss": 0.94340682, "num_input_tokens_seen": 99948100, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.6171875, "step": 4627, "time_per_iteration": 2.3378424644470215 }, { "auxiliary_loss_clip": 0.01085744, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.01732719, "balance_loss_mlp": 1.02700353, "epoch": 0.2782504133473621, "flos": 27088061633280.0, "grad_norm": 1.7595041837186949, "language_loss": 0.85137182, "learning_rate": 3.2836768681426234e-06, "loss": 0.87257147, "num_input_tokens_seen": 99966470, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5859375, "step": 4628, "time_per_iteration": 2.420797348022461 }, { "auxiliary_loss_clip": 0.01078953, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.01926446, "balance_loss_mlp": 1.02317333, "epoch": 0.2783105366000301, "flos": 21067778430720.0, "grad_norm": 1.535526018524169, "language_loss": 0.79222208, "learning_rate": 3.2833871160047998e-06, "loss": 0.81336504, "num_input_tokens_seen": 99985930, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5546875, "step": 4629, "time_per_iteration": 2.3647055625915527 }, { "auxiliary_loss_clip": 0.01079858, "auxiliary_loss_mlp": 0.01029621, "balance_loss_clip": 1.01469588, "balance_loss_mlp": 1.02572298, "epoch": 0.27837065985269804, "flos": 26500717964160.0, "grad_norm": 1.469135987762367, "language_loss": 0.84519124, "learning_rate": 3.2830973180661907e-06, "loss": 0.86628604, "num_input_tokens_seen": 100006235, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.54296875, "step": 4630, "time_per_iteration": 2.460904121398926 }, { "auxiliary_loss_clip": 0.01082379, "auxiliary_loss_mlp": 0.01029705, "balance_loss_clip": 1.01239622, "balance_loss_mlp": 1.02568364, "epoch": 0.278430783105366, "flos": 20222356924800.0, "grad_norm": 1.8684967864062993, "language_loss": 0.80862486, "learning_rate": 3.2828074743371394e-06, "loss": 0.82974571, "num_input_tokens_seen": 100023655, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.56640625, "step": 4631, "time_per_iteration": 2.3690006732940674 }, { "auxiliary_loss_clip": 0.01088817, "auxiliary_loss_mlp": 0.0103669, "balance_loss_clip": 1.01936841, "balance_loss_mlp": 1.02829027, "epoch": 0.278490906358034, "flos": 25591719139200.0, "grad_norm": 1.7264174442040423, "language_loss": 0.70928347, "learning_rate": 3.2825175848279884e-06, "loss": 0.73053855, "num_input_tokens_seen": 100043280, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.60546875, "step": 4632, "time_per_iteration": 2.4467008113861084 }, { "auxiliary_loss_clip": 0.01081805, "auxiliary_loss_mlp": 0.01033646, "balance_loss_clip": 1.01754093, "balance_loss_mlp": 1.02730489, "epoch": 0.27855102961070194, "flos": 16170861000960.0, "grad_norm": 1.83610757588639, "language_loss": 0.81952357, "learning_rate": 3.2822276495490844e-06, "loss": 0.8406781, "num_input_tokens_seen": 100057690, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.546875, "step": 4633, "time_per_iteration": 2.3516693115234375 }, { "auxiliary_loss_clip": 0.01083561, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.01302898, "balance_loss_mlp": 1.02532363, "epoch": 0.2786111528633699, "flos": 22926554415360.0, "grad_norm": 1.6017873903105209, "language_loss": 0.87525678, "learning_rate": 3.2819376685107733e-06, "loss": 0.89640629, "num_input_tokens_seen": 100075875, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.58203125, "step": 4634, "time_per_iteration": 2.414952039718628 }, { "auxiliary_loss_clip": 0.010871, "auxiliary_loss_mlp": 0.01040336, "balance_loss_clip": 1.02274013, "balance_loss_mlp": 1.02746367, "epoch": 0.27867127611603787, "flos": 23403083829120.0, "grad_norm": 1.561803190148723, "language_loss": 0.76841503, "learning_rate": 3.281647641723405e-06, "loss": 0.78968936, "num_input_tokens_seen": 100092930, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 4635, "time_per_iteration": 3.816922903060913 }, { "auxiliary_loss_clip": 0.01083017, "auxiliary_loss_mlp": 0.01034938, "balance_loss_clip": 1.01747417, "balance_loss_mlp": 1.02431464, "epoch": 0.27873139936870583, "flos": 19827977673600.0, "grad_norm": 1.6173373765387882, "language_loss": 0.646577, "learning_rate": 3.2813575691973288e-06, "loss": 0.66775656, "num_input_tokens_seen": 100110790, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5859375, "step": 4636, "time_per_iteration": 2.3943421840667725 }, { "auxiliary_loss_clip": 0.01086065, "auxiliary_loss_mlp": 0.01036166, "balance_loss_clip": 1.01816511, "balance_loss_mlp": 1.02606583, "epoch": 0.2787915226213738, "flos": 17706829754880.0, "grad_norm": 4.326128545341542, "language_loss": 0.83744383, "learning_rate": 3.2810674509428973e-06, "loss": 0.85866612, "num_input_tokens_seen": 100126970, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.6015625, "step": 4637, "time_per_iteration": 2.3751144409179688 }, { "auxiliary_loss_clip": 0.01080055, "auxiliary_loss_mlp": 0.01039026, "balance_loss_clip": 1.02281356, "balance_loss_mlp": 1.02487409, "epoch": 0.27885164587404176, "flos": 22089476724480.0, "grad_norm": 1.4454892374667667, "language_loss": 0.75655055, "learning_rate": 3.2807772869704634e-06, "loss": 0.77774143, "num_input_tokens_seen": 100146720, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55078125, "step": 4638, "time_per_iteration": 2.3966667652130127 }, { "auxiliary_loss_clip": 0.01084943, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.02051592, "balance_loss_mlp": 1.02643597, "epoch": 0.27891176912670973, "flos": 19206698296320.0, "grad_norm": 1.7643012345139144, "language_loss": 0.71589458, "learning_rate": 3.2804870772903826e-06, "loss": 0.73712677, "num_input_tokens_seen": 100165920, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.5859375, "step": 4639, "time_per_iteration": 3.7666375637054443 }, { "auxiliary_loss_clip": 0.01082703, "auxiliary_loss_mlp": 0.01032289, "balance_loss_clip": 1.0154686, "balance_loss_mlp": 1.02486873, "epoch": 0.27897189237937775, "flos": 27598771134720.0, "grad_norm": 1.6964359593390061, "language_loss": 0.65952086, "learning_rate": 3.2801968219130123e-06, "loss": 0.68067074, "num_input_tokens_seen": 100185525, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.578125, "step": 4640, "time_per_iteration": 2.4182565212249756 }, { "auxiliary_loss_clip": 0.01085352, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.01653075, "balance_loss_mlp": 1.02631187, "epoch": 0.2790320156320457, "flos": 21177161320320.0, "grad_norm": 1.7423823552122508, "language_loss": 0.71854401, "learning_rate": 3.27990652084871e-06, "loss": 0.73973787, "num_input_tokens_seen": 100204850, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.58984375, "step": 4641, "time_per_iteration": 2.3853704929351807 }, { "auxiliary_loss_clip": 0.01084863, "auxiliary_loss_mlp": 0.01036614, "balance_loss_clip": 1.01801729, "balance_loss_mlp": 1.02554667, "epoch": 0.2790921388847137, "flos": 22782816881280.0, "grad_norm": 2.0332748604370914, "language_loss": 0.74730569, "learning_rate": 3.279616174107837e-06, "loss": 0.76852047, "num_input_tokens_seen": 100224520, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.59375, "step": 4642, "time_per_iteration": 2.3755061626434326 }, { "auxiliary_loss_clip": 0.01086728, "auxiliary_loss_mlp": 0.01034502, "balance_loss_clip": 1.01653767, "balance_loss_mlp": 1.02767062, "epoch": 0.27915226213738165, "flos": 23399627604480.0, "grad_norm": 1.8367534855506094, "language_loss": 0.8581357, "learning_rate": 3.2793257817007537e-06, "loss": 0.87934798, "num_input_tokens_seen": 100243935, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.59375, "step": 4643, "time_per_iteration": 3.7561748027801514 }, { "auxiliary_loss_clip": 0.01085337, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.01799083, "balance_loss_mlp": 1.02705395, "epoch": 0.2792123853900496, "flos": 22746681757440.0, "grad_norm": 1.6889448295685452, "language_loss": 0.83135402, "learning_rate": 3.279035343637824e-06, "loss": 0.85255277, "num_input_tokens_seen": 100262290, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.58203125, "step": 4644, "time_per_iteration": 2.422675132751465 }, { "auxiliary_loss_clip": 0.01085248, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.01912344, "balance_loss_mlp": 1.02601528, "epoch": 0.2792725086427176, "flos": 15048472746240.0, "grad_norm": 2.105528629850968, "language_loss": 0.78621614, "learning_rate": 3.2787448599294135e-06, "loss": 0.80743003, "num_input_tokens_seen": 100280015, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59375, "step": 4645, "time_per_iteration": 3.7727739810943604 }, { "auxiliary_loss_clip": 0.01021076, "auxiliary_loss_mlp": 0.01004018, "balance_loss_clip": 1.00238454, "balance_loss_mlp": 1.00637841, "epoch": 0.27933263189538554, "flos": 62541871073280.0, "grad_norm": 0.7735840675798167, "language_loss": 0.62300205, "learning_rate": 3.2784543305858878e-06, "loss": 0.64325297, "num_input_tokens_seen": 100338935, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.14648438, "step": 4646, "time_per_iteration": 3.005046844482422 }, { "auxiliary_loss_clip": 0.01079181, "auxiliary_loss_mlp": 0.01028121, "balance_loss_clip": 1.01285076, "balance_loss_mlp": 1.02494645, "epoch": 0.2793927551480535, "flos": 25117214584320.0, "grad_norm": 1.5647752319544206, "language_loss": 0.89292717, "learning_rate": 3.2781637556176155e-06, "loss": 0.91400021, "num_input_tokens_seen": 100359905, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.54296875, "step": 4647, "time_per_iteration": 2.407451629638672 }, { "auxiliary_loss_clip": 0.01084096, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.01748323, "balance_loss_mlp": 1.02522612, "epoch": 0.27945287840072147, "flos": 21323517206400.0, "grad_norm": 1.5465899249731474, "language_loss": 0.87006688, "learning_rate": 3.2778731350349673e-06, "loss": 0.89126772, "num_input_tokens_seen": 100376955, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.5859375, "step": 4648, "time_per_iteration": 2.3792061805725098 }, { "auxiliary_loss_clip": 0.01084972, "auxiliary_loss_mlp": 0.01033372, "balance_loss_clip": 1.01506186, "balance_loss_mlp": 1.02492583, "epoch": 0.27951300165338944, "flos": 27449412871680.0, "grad_norm": 2.5723668293156186, "language_loss": 0.73237026, "learning_rate": 3.2775824688483138e-06, "loss": 0.75355369, "num_input_tokens_seen": 100397545, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.6015625, "step": 4649, "time_per_iteration": 2.4175124168395996 }, { "auxiliary_loss_clip": 0.01082694, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.01552844, "balance_loss_mlp": 1.02423, "epoch": 0.2795731249060574, "flos": 15158100015360.0, "grad_norm": 3.53622186829596, "language_loss": 0.79836136, "learning_rate": 3.2772917570680278e-06, "loss": 0.81953508, "num_input_tokens_seen": 100415080, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.5859375, "step": 4650, "time_per_iteration": 2.35884165763855 }, { "auxiliary_loss_clip": 0.01020244, "auxiliary_loss_mlp": 0.01001616, "balance_loss_clip": 0.99998331, "balance_loss_mlp": 1.00546682, "epoch": 0.27963324815872537, "flos": 60116628412800.0, "grad_norm": 0.8185614181317514, "language_loss": 0.58871585, "learning_rate": 3.2770009997044846e-06, "loss": 0.6089344, "num_input_tokens_seen": 100471105, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.14746094, "step": 4651, "time_per_iteration": 3.0610158443450928 }, { "auxiliary_loss_clip": 0.01085743, "auxiliary_loss_mlp": 0.01038878, "balance_loss_clip": 1.01916051, "balance_loss_mlp": 1.02432191, "epoch": 0.27969337141139333, "flos": 21764784280320.0, "grad_norm": 1.580454444405715, "language_loss": 0.73771393, "learning_rate": 3.2767101967680607e-06, "loss": 0.75896013, "num_input_tokens_seen": 100492520, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.61328125, "step": 4652, "time_per_iteration": 2.41622257232666 }, { "auxiliary_loss_clip": 0.01087177, "auxiliary_loss_mlp": 0.01033025, "balance_loss_clip": 1.01547742, "balance_loss_mlp": 1.02712393, "epoch": 0.27975349466406135, "flos": 39850038794880.0, "grad_norm": 2.315524986794949, "language_loss": 0.79685175, "learning_rate": 3.276419348269134e-06, "loss": 0.81805372, "num_input_tokens_seen": 100512870, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.6015625, "step": 4653, "time_per_iteration": 2.511756420135498 }, { "auxiliary_loss_clip": 0.0108185, "auxiliary_loss_mlp": 0.01034671, "balance_loss_clip": 1.01760054, "balance_loss_mlp": 1.02443719, "epoch": 0.2798136179167293, "flos": 21578732311680.0, "grad_norm": 2.169439957846448, "language_loss": 0.78860891, "learning_rate": 3.2761284542180842e-06, "loss": 0.80977416, "num_input_tokens_seen": 100531655, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.57421875, "step": 4654, "time_per_iteration": 2.377237558364868 }, { "auxiliary_loss_clip": 0.01086783, "auxiliary_loss_mlp": 0.01041431, "balance_loss_clip": 1.02271509, "balance_loss_mlp": 1.02596807, "epoch": 0.2798737411693973, "flos": 21536766990720.0, "grad_norm": 2.0206426507677584, "language_loss": 0.81036353, "learning_rate": 3.2758375146252924e-06, "loss": 0.83164567, "num_input_tokens_seen": 100548005, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.609375, "step": 4655, "time_per_iteration": 2.3665342330932617 }, { "auxiliary_loss_clip": 0.01083031, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.02021265, "balance_loss_mlp": 1.02424467, "epoch": 0.27993386442206525, "flos": 26979795907200.0, "grad_norm": 1.612782826834512, "language_loss": 0.8088764, "learning_rate": 3.275546529501142e-06, "loss": 0.83008605, "num_input_tokens_seen": 100567980, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58984375, "step": 4656, "time_per_iteration": 2.428345203399658 }, { "auxiliary_loss_clip": 0.01083125, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.0142529, "balance_loss_mlp": 1.0257895, "epoch": 0.2799939876747332, "flos": 24348811271040.0, "grad_norm": 1.5835376969658335, "language_loss": 0.83297378, "learning_rate": 3.2752554988560165e-06, "loss": 0.85412127, "num_input_tokens_seen": 100588630, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.57421875, "step": 4657, "time_per_iteration": 2.4023072719573975 }, { "auxiliary_loss_clip": 0.0108508, "auxiliary_loss_mlp": 0.01037186, "balance_loss_clip": 1.01986456, "balance_loss_mlp": 1.0267309, "epoch": 0.2800541109274012, "flos": 33655573486080.0, "grad_norm": 1.9253757388017114, "language_loss": 0.63597363, "learning_rate": 3.274964422700303e-06, "loss": 0.65719628, "num_input_tokens_seen": 100608775, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5859375, "step": 4658, "time_per_iteration": 2.4912872314453125 }, { "auxiliary_loss_clip": 0.01084028, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.01922905, "balance_loss_mlp": 1.02554047, "epoch": 0.28011423418006914, "flos": 21613401158400.0, "grad_norm": 3.6337546554885405, "language_loss": 0.78746295, "learning_rate": 3.274673301044388e-06, "loss": 0.8086735, "num_input_tokens_seen": 100627975, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.5859375, "step": 4659, "time_per_iteration": 2.400369644165039 }, { "auxiliary_loss_clip": 0.01087056, "auxiliary_loss_mlp": 0.01041065, "balance_loss_clip": 1.02405405, "balance_loss_mlp": 1.027619, "epoch": 0.2801743574327371, "flos": 23111314663680.0, "grad_norm": 1.7404912596301556, "language_loss": 0.78804803, "learning_rate": 3.274382133898663e-06, "loss": 0.80932927, "num_input_tokens_seen": 100645430, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59375, "step": 4660, "time_per_iteration": 2.39731764793396 }, { "auxiliary_loss_clip": 0.01081603, "auxiliary_loss_mlp": 0.01032864, "balance_loss_clip": 1.01719964, "balance_loss_mlp": 1.02556157, "epoch": 0.2802344806854051, "flos": 12640582897920.0, "grad_norm": 1.7660425944191298, "language_loss": 0.80277127, "learning_rate": 3.2740909212735172e-06, "loss": 0.82391596, "num_input_tokens_seen": 100663775, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5625, "step": 4661, "time_per_iteration": 2.3529486656188965 }, { "auxiliary_loss_clip": 0.01085122, "auxiliary_loss_mlp": 0.01045281, "balance_loss_clip": 1.02856791, "balance_loss_mlp": 1.02765727, "epoch": 0.28029460393807304, "flos": 37266395829120.0, "grad_norm": 1.504622092235081, "language_loss": 0.78868937, "learning_rate": 3.273799663179343e-06, "loss": 0.80999339, "num_input_tokens_seen": 100686085, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.57421875, "step": 4662, "time_per_iteration": 2.497027635574341 }, { "auxiliary_loss_clip": 0.01086981, "auxiliary_loss_mlp": 0.01036987, "balance_loss_clip": 1.01740074, "balance_loss_mlp": 1.02619553, "epoch": 0.280354727190741, "flos": 20740048698240.0, "grad_norm": 1.933075453578351, "language_loss": 0.69929224, "learning_rate": 3.273508359626536e-06, "loss": 0.72053194, "num_input_tokens_seen": 100705135, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.609375, "step": 4663, "time_per_iteration": 2.3991286754608154 }, { "auxiliary_loss_clip": 0.01086619, "auxiliary_loss_mlp": 0.01037239, "balance_loss_clip": 1.01793909, "balance_loss_mlp": 1.02787316, "epoch": 0.28041485044340897, "flos": 21469942915200.0, "grad_norm": 1.6952368274520868, "language_loss": 0.77922308, "learning_rate": 3.2732170106254923e-06, "loss": 0.80046165, "num_input_tokens_seen": 100724960, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.58984375, "step": 4664, "time_per_iteration": 2.4059791564941406 }, { "auxiliary_loss_clip": 0.01079312, "auxiliary_loss_mlp": 0.01035245, "balance_loss_clip": 1.01836514, "balance_loss_mlp": 1.02439642, "epoch": 0.28047497369607693, "flos": 14793362375040.0, "grad_norm": 1.9302255295511632, "language_loss": 0.79399538, "learning_rate": 3.272925616186607e-06, "loss": 0.81514096, "num_input_tokens_seen": 100741995, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.546875, "step": 4665, "time_per_iteration": 2.3554060459136963 }, { "auxiliary_loss_clip": 0.01080987, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.01413536, "balance_loss_mlp": 1.0249579, "epoch": 0.2805350969487449, "flos": 23069768279040.0, "grad_norm": 1.7143730966235324, "language_loss": 0.80576044, "learning_rate": 3.2726341763202823e-06, "loss": 0.82687855, "num_input_tokens_seen": 100758985, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5625, "step": 4666, "time_per_iteration": 2.378121852874756 }, { "auxiliary_loss_clip": 0.01085558, "auxiliary_loss_mlp": 0.01040671, "balance_loss_clip": 1.02313566, "balance_loss_mlp": 1.02600992, "epoch": 0.2805952202014129, "flos": 20478968484480.0, "grad_norm": 1.9741226959435492, "language_loss": 0.8464638, "learning_rate": 3.2723426910369166e-06, "loss": 0.86772609, "num_input_tokens_seen": 100777820, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59375, "step": 4667, "time_per_iteration": 2.4156687259674072 }, { "auxiliary_loss_clip": 0.01085877, "auxiliary_loss_mlp": 0.01037155, "balance_loss_clip": 1.0192256, "balance_loss_mlp": 1.02700794, "epoch": 0.2806553434540809, "flos": 27416105568000.0, "grad_norm": 1.6805529541946873, "language_loss": 0.79382908, "learning_rate": 3.2720511603469136e-06, "loss": 0.81505942, "num_input_tokens_seen": 100798205, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.58984375, "step": 4668, "time_per_iteration": 2.422001838684082 }, { "auxiliary_loss_clip": 0.01082204, "auxiliary_loss_mlp": 0.01035529, "balance_loss_clip": 1.01775444, "balance_loss_mlp": 1.02319944, "epoch": 0.28071546670674885, "flos": 26503825075200.0, "grad_norm": 1.4097722041710656, "language_loss": 0.76093447, "learning_rate": 3.2717595842606766e-06, "loss": 0.78211176, "num_input_tokens_seen": 100819800, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58984375, "step": 4669, "time_per_iteration": 2.439584970474243 }, { "auxiliary_loss_clip": 0.01082773, "auxiliary_loss_mlp": 0.01036039, "balance_loss_clip": 1.01906371, "balance_loss_mlp": 1.02528119, "epoch": 0.2807755899594168, "flos": 20557627511040.0, "grad_norm": 1.9977654540266632, "language_loss": 0.78902614, "learning_rate": 3.271467962788611e-06, "loss": 0.81021422, "num_input_tokens_seen": 100837880, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.57421875, "step": 4670, "time_per_iteration": 2.372771978378296 }, { "auxiliary_loss_clip": 0.01086034, "auxiliary_loss_mlp": 0.01040292, "balance_loss_clip": 1.02133775, "balance_loss_mlp": 1.02653217, "epoch": 0.2808357132120848, "flos": 24312257210880.0, "grad_norm": 1.7770239131978183, "language_loss": 0.79119527, "learning_rate": 3.271176295941125e-06, "loss": 0.81245852, "num_input_tokens_seen": 100856350, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.59375, "step": 4671, "time_per_iteration": 2.4365363121032715 }, { "auxiliary_loss_clip": 0.01080077, "auxiliary_loss_mlp": 0.01030405, "balance_loss_clip": 1.01521778, "balance_loss_mlp": 1.02597547, "epoch": 0.28089583646475275, "flos": 26431205713920.0, "grad_norm": 1.897473217997013, "language_loss": 0.75169575, "learning_rate": 3.270884583728626e-06, "loss": 0.77280051, "num_input_tokens_seen": 100876135, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.54296875, "step": 4672, "time_per_iteration": 2.426450252532959 }, { "auxiliary_loss_clip": 0.0108321, "auxiliary_loss_mlp": 0.01037734, "balance_loss_clip": 1.01870811, "balance_loss_mlp": 1.02466989, "epoch": 0.2809559597174207, "flos": 23110721170560.0, "grad_norm": 2.6686100036483578, "language_loss": 0.75073838, "learning_rate": 3.2705928261615263e-06, "loss": 0.77194774, "num_input_tokens_seen": 100894790, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.5859375, "step": 4673, "time_per_iteration": 2.4145619869232178 }, { "auxiliary_loss_clip": 0.01082577, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.01656151, "balance_loss_mlp": 1.02480292, "epoch": 0.2810160829700887, "flos": 20922434974080.0, "grad_norm": 4.5962322931246815, "language_loss": 0.72032297, "learning_rate": 3.270301023250237e-06, "loss": 0.74149305, "num_input_tokens_seen": 100915100, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.578125, "step": 4674, "time_per_iteration": 2.38694429397583 }, { "auxiliary_loss_clip": 0.01085738, "auxiliary_loss_mlp": 0.01035036, "balance_loss_clip": 1.0167371, "balance_loss_mlp": 1.02691054, "epoch": 0.28107620622275664, "flos": 14355027855360.0, "grad_norm": 2.0331109639680824, "language_loss": 0.76845366, "learning_rate": 3.270009175005171e-06, "loss": 0.78966129, "num_input_tokens_seen": 100932795, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.58984375, "step": 4675, "time_per_iteration": 3.7418601512908936 }, { "auxiliary_loss_clip": 0.01084226, "auxiliary_loss_mlp": 0.01035303, "balance_loss_clip": 1.01752877, "balance_loss_mlp": 1.02490258, "epoch": 0.2811363294754246, "flos": 20918140876800.0, "grad_norm": 2.2300754752588623, "language_loss": 0.70094705, "learning_rate": 3.2697172814367447e-06, "loss": 0.72214234, "num_input_tokens_seen": 100950505, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.59375, "step": 4676, "time_per_iteration": 2.360639810562134 }, { "auxiliary_loss_clip": 0.01077331, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.01862073, "balance_loss_mlp": 1.02451169, "epoch": 0.28119645272809257, "flos": 20593797546240.0, "grad_norm": 1.5952549700482799, "language_loss": 0.70463085, "learning_rate": 3.269425342555375e-06, "loss": 0.72575355, "num_input_tokens_seen": 100968790, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.52734375, "step": 4677, "time_per_iteration": 2.3933629989624023 }, { "auxiliary_loss_clip": 0.01084524, "auxiliary_loss_mlp": 0.01034786, "balance_loss_clip": 1.01754856, "balance_loss_mlp": 1.02574372, "epoch": 0.28125657598076054, "flos": 25336259654400.0, "grad_norm": 1.5735103055016368, "language_loss": 0.63790667, "learning_rate": 3.26913335837148e-06, "loss": 0.65909982, "num_input_tokens_seen": 100990205, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5859375, "step": 4678, "time_per_iteration": 3.7785065174102783 }, { "auxiliary_loss_clip": 0.01080307, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.01508796, "balance_loss_mlp": 1.02366781, "epoch": 0.2813166992334285, "flos": 24825934177920.0, "grad_norm": 9.072963622735532, "language_loss": 0.70800877, "learning_rate": 3.26884132889548e-06, "loss": 0.72912794, "num_input_tokens_seen": 101009815, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.56640625, "step": 4679, "time_per_iteration": 2.4279818534851074 }, { "auxiliary_loss_clip": 0.01081878, "auxiliary_loss_mlp": 0.01036609, "balance_loss_clip": 1.0186677, "balance_loss_mlp": 1.02452493, "epoch": 0.2813768224860965, "flos": 21759722133120.0, "grad_norm": 1.8891183685317214, "language_loss": 0.7466377, "learning_rate": 3.268549254137797e-06, "loss": 0.7678225, "num_input_tokens_seen": 101026780, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5703125, "step": 4680, "time_per_iteration": 2.3850209712982178 }, { "auxiliary_loss_clip": 0.01084906, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.01566339, "balance_loss_mlp": 1.02604866, "epoch": 0.2814369457387645, "flos": 24315643612800.0, "grad_norm": 1.6765047915602025, "language_loss": 0.77050579, "learning_rate": 3.2682571341088537e-06, "loss": 0.79167211, "num_input_tokens_seen": 101046215, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5859375, "step": 4681, "time_per_iteration": 2.4106409549713135 }, { "auxiliary_loss_clip": 0.01083417, "auxiliary_loss_mlp": 0.01029796, "balance_loss_clip": 1.01258218, "balance_loss_mlp": 1.02506196, "epoch": 0.28149706899143245, "flos": 18514335657600.0, "grad_norm": 1.9595536419474688, "language_loss": 0.73747474, "learning_rate": 3.2679649688190765e-06, "loss": 0.75860685, "num_input_tokens_seen": 101063365, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58203125, "step": 4682, "time_per_iteration": 3.7066774368286133 }, { "auxiliary_loss_clip": 0.01081102, "auxiliary_loss_mlp": 0.0103029, "balance_loss_clip": 1.01382756, "balance_loss_mlp": 1.02451968, "epoch": 0.2815571922441004, "flos": 24862104213120.0, "grad_norm": 1.6084029234979913, "language_loss": 0.80664718, "learning_rate": 3.2676727582788904e-06, "loss": 0.82776105, "num_input_tokens_seen": 101083835, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.56640625, "step": 4683, "time_per_iteration": 2.3952901363372803 }, { "auxiliary_loss_clip": 0.01084205, "auxiliary_loss_mlp": 0.01032073, "balance_loss_clip": 1.01508546, "balance_loss_mlp": 1.02594912, "epoch": 0.2816173154967684, "flos": 19900597034880.0, "grad_norm": 1.6779577758032087, "language_loss": 0.76085913, "learning_rate": 3.2673805024987246e-06, "loss": 0.78202188, "num_input_tokens_seen": 101101740, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.58203125, "step": 4684, "time_per_iteration": 2.3700644969940186 }, { "auxiliary_loss_clip": 0.01080426, "auxiliary_loss_mlp": 0.01034152, "balance_loss_clip": 1.01786816, "balance_loss_mlp": 1.02411342, "epoch": 0.28167743874943635, "flos": 17490437948160.0, "grad_norm": 1.8118662902140592, "language_loss": 0.76061994, "learning_rate": 3.2670882014890085e-06, "loss": 0.7817657, "num_input_tokens_seen": 101120480, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5625, "step": 4685, "time_per_iteration": 3.7422001361846924 }, { "auxiliary_loss_clip": 0.0107851, "auxiliary_loss_mlp": 0.01031878, "balance_loss_clip": 1.01596391, "balance_loss_mlp": 1.02470148, "epoch": 0.2817375620021043, "flos": 25300927491840.0, "grad_norm": 1.4273435716694678, "language_loss": 0.75569367, "learning_rate": 3.2667958552601747e-06, "loss": 0.77679753, "num_input_tokens_seen": 101142910, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5390625, "step": 4686, "time_per_iteration": 2.422046422958374 }, { "auxiliary_loss_clip": 0.01086003, "auxiliary_loss_mlp": 0.01037863, "balance_loss_clip": 1.018682, "balance_loss_mlp": 1.02572322, "epoch": 0.2817976852547723, "flos": 18692358013440.0, "grad_norm": 2.7393855498237363, "language_loss": 0.63064349, "learning_rate": 3.266503463822655e-06, "loss": 0.65188217, "num_input_tokens_seen": 101160030, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6015625, "step": 4687, "time_per_iteration": 2.3531594276428223 }, { "auxiliary_loss_clip": 0.01083639, "auxiliary_loss_mlp": 0.01037333, "balance_loss_clip": 1.02036977, "balance_loss_mlp": 1.02557552, "epoch": 0.28185780850744024, "flos": 22741305408000.0, "grad_norm": 2.474246556760466, "language_loss": 0.7603215, "learning_rate": 3.266211027186884e-06, "loss": 0.78153127, "num_input_tokens_seen": 101177675, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.578125, "step": 4688, "time_per_iteration": 2.3718762397766113 }, { "auxiliary_loss_clip": 0.0108024, "auxiliary_loss_mlp": 0.01034938, "balance_loss_clip": 1.01889277, "balance_loss_mlp": 1.02497983, "epoch": 0.2819179317601082, "flos": 14933189836800.0, "grad_norm": 2.014022123426822, "language_loss": 0.78435707, "learning_rate": 3.265918545363299e-06, "loss": 0.80550891, "num_input_tokens_seen": 101192225, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5546875, "step": 4689, "time_per_iteration": 2.3210413455963135 }, { "auxiliary_loss_clip": 0.010826, "auxiliary_loss_mlp": 0.0103214, "balance_loss_clip": 1.01487827, "balance_loss_mlp": 1.02557874, "epoch": 0.2819780550127762, "flos": 23144307765120.0, "grad_norm": 1.8974032460474843, "language_loss": 0.78310406, "learning_rate": 3.2656260183623373e-06, "loss": 0.80425143, "num_input_tokens_seen": 101210870, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5703125, "step": 4690, "time_per_iteration": 2.3867547512054443 }, { "auxiliary_loss_clip": 0.01082281, "auxiliary_loss_mlp": 0.0103524, "balance_loss_clip": 1.0188365, "balance_loss_mlp": 1.02418804, "epoch": 0.28203817826544414, "flos": 21615286371840.0, "grad_norm": 2.967278175848558, "language_loss": 0.88040459, "learning_rate": 3.265333446194439e-06, "loss": 0.90157986, "num_input_tokens_seen": 101229965, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.58203125, "step": 4691, "time_per_iteration": 2.373430013656616 }, { "auxiliary_loss_clip": 0.010855, "auxiliary_loss_mlp": 0.01038103, "balance_loss_clip": 1.0203526, "balance_loss_mlp": 1.02565289, "epoch": 0.2820983015181121, "flos": 24025585104000.0, "grad_norm": 1.602629082049501, "language_loss": 0.81951904, "learning_rate": 3.2650408288700442e-06, "loss": 0.84075511, "num_input_tokens_seen": 101250980, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.59765625, "step": 4692, "time_per_iteration": 2.4163925647735596 }, { "auxiliary_loss_clip": 0.01082316, "auxiliary_loss_mlp": 0.01037073, "balance_loss_clip": 1.01933467, "balance_loss_mlp": 1.02429128, "epoch": 0.2821584247707801, "flos": 30006626071680.0, "grad_norm": 1.530449377087271, "language_loss": 0.74562776, "learning_rate": 3.264748166399596e-06, "loss": 0.76682162, "num_input_tokens_seen": 101273335, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.578125, "step": 4693, "time_per_iteration": 2.4468326568603516 }, { "auxiliary_loss_clip": 0.01083424, "auxiliary_loss_mlp": 0.01029993, "balance_loss_clip": 1.01282704, "balance_loss_mlp": 1.02475452, "epoch": 0.2822185480234481, "flos": 21395717631360.0, "grad_norm": 1.5564974454766827, "language_loss": 0.77406073, "learning_rate": 3.2644554587935397e-06, "loss": 0.79519486, "num_input_tokens_seen": 101292110, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 4694, "time_per_iteration": 2.374176502227783 }, { "auxiliary_loss_clip": 0.01080679, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.0154531, "balance_loss_mlp": 1.02528489, "epoch": 0.28227867127611606, "flos": 27451612287360.0, "grad_norm": 2.4200896891866197, "language_loss": 0.66699767, "learning_rate": 3.2641627060623205e-06, "loss": 0.68812329, "num_input_tokens_seen": 101312815, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5546875, "step": 4695, "time_per_iteration": 2.4323978424072266 }, { "auxiliary_loss_clip": 0.010852, "auxiliary_loss_mlp": 0.01038959, "balance_loss_clip": 1.02069628, "balance_loss_mlp": 1.02541971, "epoch": 0.282338794528784, "flos": 22592854840320.0, "grad_norm": 1.9522033232109488, "language_loss": 0.75408053, "learning_rate": 3.263869908216387e-06, "loss": 0.7753222, "num_input_tokens_seen": 101329045, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.59765625, "step": 4696, "time_per_iteration": 2.363227367401123 }, { "auxiliary_loss_clip": 0.01084991, "auxiliary_loss_mlp": 0.01038945, "balance_loss_clip": 1.02125454, "balance_loss_mlp": 1.02661145, "epoch": 0.282398917781452, "flos": 42009311784960.0, "grad_norm": 1.8101574755389118, "language_loss": 0.62200546, "learning_rate": 3.2635770652661866e-06, "loss": 0.64324486, "num_input_tokens_seen": 101352715, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.58203125, "step": 4697, "time_per_iteration": 2.547203540802002 }, { "auxiliary_loss_clip": 0.01077729, "auxiliary_loss_mlp": 0.01031658, "balance_loss_clip": 1.01658988, "balance_loss_mlp": 1.02276564, "epoch": 0.28245904103411995, "flos": 23223525373440.0, "grad_norm": 1.526081382041632, "language_loss": 0.73026216, "learning_rate": 3.263284177222171e-06, "loss": 0.75135601, "num_input_tokens_seen": 101374640, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.55078125, "step": 4698, "time_per_iteration": 2.438342809677124 }, { "auxiliary_loss_clip": 0.0107942, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 1.01820874, "balance_loss_mlp": 1.02436626, "epoch": 0.2825191642867879, "flos": 25373442119040.0, "grad_norm": 2.5472219262396782, "language_loss": 0.74780232, "learning_rate": 3.2629912440947927e-06, "loss": 0.76893842, "num_input_tokens_seen": 101393595, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55078125, "step": 4699, "time_per_iteration": 2.4066805839538574 }, { "auxiliary_loss_clip": 0.01084279, "auxiliary_loss_mlp": 0.01033867, "balance_loss_clip": 1.01485324, "balance_loss_mlp": 1.02545559, "epoch": 0.2825792875394559, "flos": 17235886158720.0, "grad_norm": 3.0409712150223758, "language_loss": 0.79499203, "learning_rate": 3.262698265894506e-06, "loss": 0.81617349, "num_input_tokens_seen": 101409265, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.58984375, "step": 4700, "time_per_iteration": 2.3409626483917236 }, { "auxiliary_loss_clip": 0.01076736, "auxiliary_loss_mlp": 0.01032518, "balance_loss_clip": 1.01746178, "balance_loss_mlp": 1.02353406, "epoch": 0.28263941079212385, "flos": 26722765411200.0, "grad_norm": 1.7282565623755062, "language_loss": 0.81723988, "learning_rate": 3.2624052426317664e-06, "loss": 0.83833241, "num_input_tokens_seen": 101428365, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53125, "step": 4701, "time_per_iteration": 2.4084768295288086 }, { "auxiliary_loss_clip": 0.01081669, "auxiliary_loss_mlp": 0.01029812, "balance_loss_clip": 1.01359963, "balance_loss_mlp": 1.02464962, "epoch": 0.2826995340447918, "flos": 25920147098880.0, "grad_norm": 2.3292537520678533, "language_loss": 0.73631829, "learning_rate": 3.26211217431703e-06, "loss": 0.75743312, "num_input_tokens_seen": 101447280, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5703125, "step": 4702, "time_per_iteration": 2.4375617504119873 }, { "auxiliary_loss_clip": 0.01081064, "auxiliary_loss_mlp": 0.01032388, "balance_loss_clip": 1.01566291, "balance_loss_mlp": 1.02562058, "epoch": 0.2827596572974598, "flos": 22378697360640.0, "grad_norm": 6.575249261037677, "language_loss": 0.78327727, "learning_rate": 3.2618190609607577e-06, "loss": 0.80441177, "num_input_tokens_seen": 101465435, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5546875, "step": 4703, "time_per_iteration": 2.3829078674316406 }, { "auxiliary_loss_clip": 0.01078783, "auxiliary_loss_mlp": 0.01035038, "balance_loss_clip": 1.01760912, "balance_loss_mlp": 1.02337193, "epoch": 0.28281978055012774, "flos": 33545736748800.0, "grad_norm": 1.6192427527875588, "language_loss": 0.69177246, "learning_rate": 3.2615259025734085e-06, "loss": 0.71291065, "num_input_tokens_seen": 101486355, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5546875, "step": 4704, "time_per_iteration": 2.495262861251831 }, { "auxiliary_loss_clip": 0.0108131, "auxiliary_loss_mlp": 0.01034461, "balance_loss_clip": 1.0182128, "balance_loss_mlp": 1.02644086, "epoch": 0.2828799038027957, "flos": 23439742623360.0, "grad_norm": 3.3627548595005825, "language_loss": 0.69745326, "learning_rate": 3.261232699165445e-06, "loss": 0.71861094, "num_input_tokens_seen": 101505875, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.546875, "step": 4705, "time_per_iteration": 2.387277603149414 }, { "auxiliary_loss_clip": 0.01016274, "auxiliary_loss_mlp": 0.01002163, "balance_loss_clip": 1.00050628, "balance_loss_mlp": 1.00312138, "epoch": 0.2829400270554637, "flos": 69870629410560.0, "grad_norm": 0.7218649370869256, "language_loss": 0.59269124, "learning_rate": 3.2609394507473305e-06, "loss": 0.61287564, "num_input_tokens_seen": 101565045, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.13183594, "step": 4706, "time_per_iteration": 3.0769612789154053 }, { "auxiliary_loss_clip": 0.01077743, "auxiliary_loss_mlp": 0.01027441, "balance_loss_clip": 1.01323092, "balance_loss_mlp": 1.02472639, "epoch": 0.2830001503081317, "flos": 24787913840640.0, "grad_norm": 2.506359337605025, "language_loss": 0.82300955, "learning_rate": 3.2606461573295303e-06, "loss": 0.84406137, "num_input_tokens_seen": 101585825, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.53125, "step": 4707, "time_per_iteration": 2.4101438522338867 }, { "auxiliary_loss_clip": 0.01084681, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.01473212, "balance_loss_mlp": 1.02583933, "epoch": 0.28306027356079966, "flos": 27668248473600.0, "grad_norm": 1.639820805608009, "language_loss": 0.80289948, "learning_rate": 3.260352818922512e-06, "loss": 0.82407683, "num_input_tokens_seen": 101606105, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.5859375, "step": 4708, "time_per_iteration": 2.4202804565429688 }, { "auxiliary_loss_clip": 0.01015369, "auxiliary_loss_mlp": 0.01001945, "balance_loss_clip": 1.00024033, "balance_loss_mlp": 1.002424, "epoch": 0.2831203968134676, "flos": 60525288276480.0, "grad_norm": 0.9240460877029365, "language_loss": 0.62830722, "learning_rate": 3.2600594355367434e-06, "loss": 0.64848042, "num_input_tokens_seen": 101656875, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.12890625, "step": 4709, "time_per_iteration": 2.9079742431640625 }, { "auxiliary_loss_clip": 0.01079717, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.01475906, "balance_loss_mlp": 1.02518725, "epoch": 0.2831805200661356, "flos": 22053690714240.0, "grad_norm": 1.3603003737777664, "language_loss": 0.73955834, "learning_rate": 3.259766007182695e-06, "loss": 0.76066661, "num_input_tokens_seen": 101676225, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.54296875, "step": 4710, "time_per_iteration": 2.3806557655334473 }, { "auxiliary_loss_clip": 0.01081722, "auxiliary_loss_mlp": 0.01042753, "balance_loss_clip": 1.02480066, "balance_loss_mlp": 1.0248003, "epoch": 0.28324064331880355, "flos": 22599592732800.0, "grad_norm": 1.7428122146726683, "language_loss": 0.79544723, "learning_rate": 3.259472533870838e-06, "loss": 0.81669199, "num_input_tokens_seen": 101693710, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5703125, "step": 4711, "time_per_iteration": 2.3633670806884766 }, { "auxiliary_loss_clip": 0.01082922, "auxiliary_loss_mlp": 0.01031946, "balance_loss_clip": 1.01463687, "balance_loss_mlp": 1.02554917, "epoch": 0.2833007665714715, "flos": 30402960359040.0, "grad_norm": 4.880572078214499, "language_loss": 0.70977688, "learning_rate": 3.2591790156116466e-06, "loss": 0.73092556, "num_input_tokens_seen": 101714010, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.57421875, "step": 4712, "time_per_iteration": 2.441821575164795 }, { "auxiliary_loss_clip": 0.01082213, "auxiliary_loss_mlp": 0.01039373, "balance_loss_clip": 1.0224216, "balance_loss_mlp": 1.02480936, "epoch": 0.2833608898241395, "flos": 23548392374400.0, "grad_norm": 1.8208215553892926, "language_loss": 0.81730092, "learning_rate": 3.258885452415595e-06, "loss": 0.83851677, "num_input_tokens_seen": 101732995, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.57421875, "step": 4713, "time_per_iteration": 2.383963108062744 }, { "auxiliary_loss_clip": 0.01079658, "auxiliary_loss_mlp": 0.01031279, "balance_loss_clip": 1.01531672, "balance_loss_mlp": 1.02457273, "epoch": 0.28342101307680745, "flos": 20265683788800.0, "grad_norm": 1.8461358395380656, "language_loss": 0.75691658, "learning_rate": 3.2585918442931595e-06, "loss": 0.77802593, "num_input_tokens_seen": 101751385, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55078125, "step": 4714, "time_per_iteration": 2.373971462249756 }, { "auxiliary_loss_clip": 0.01080895, "auxiliary_loss_mlp": 0.01037827, "balance_loss_clip": 1.02190018, "balance_loss_mlp": 1.02488649, "epoch": 0.2834811363294754, "flos": 30845728621440.0, "grad_norm": 1.3725109146054504, "language_loss": 0.7819891, "learning_rate": 3.258298191254818e-06, "loss": 0.80317628, "num_input_tokens_seen": 101773825, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5625, "step": 4715, "time_per_iteration": 3.8670871257781982 }, { "auxiliary_loss_clip": 0.0108006, "auxiliary_loss_mlp": 0.01029352, "balance_loss_clip": 1.01266277, "balance_loss_mlp": 1.02458525, "epoch": 0.2835412595821434, "flos": 22709918229120.0, "grad_norm": 2.1575345564238635, "language_loss": 0.73499966, "learning_rate": 3.2580044933110513e-06, "loss": 0.75609374, "num_input_tokens_seen": 101791920, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5546875, "step": 4716, "time_per_iteration": 2.385983467102051 }, { "auxiliary_loss_clip": 0.01083392, "auxiliary_loss_mlp": 0.01035076, "balance_loss_clip": 1.01724267, "balance_loss_mlp": 1.02334297, "epoch": 0.28360138283481134, "flos": 18076734276480.0, "grad_norm": 2.7059929247232484, "language_loss": 0.74639773, "learning_rate": 3.2577107504723403e-06, "loss": 0.76758242, "num_input_tokens_seen": 101809515, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6015625, "step": 4717, "time_per_iteration": 2.3544490337371826 }, { "auxiliary_loss_clip": 0.01081762, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.02196097, "balance_loss_mlp": 1.02436793, "epoch": 0.2836615060874793, "flos": 17853918779520.0, "grad_norm": 1.5746816927948468, "language_loss": 0.66727597, "learning_rate": 3.2574169627491683e-06, "loss": 0.68848491, "num_input_tokens_seen": 101827735, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.57421875, "step": 4718, "time_per_iteration": 3.727241039276123 }, { "auxiliary_loss_clip": 0.01082786, "auxiliary_loss_mlp": 0.01035875, "balance_loss_clip": 1.01866162, "balance_loss_mlp": 1.02434754, "epoch": 0.2837216293401473, "flos": 25739087454720.0, "grad_norm": 1.7899510086008625, "language_loss": 0.71910429, "learning_rate": 3.2571231301520187e-06, "loss": 0.74029094, "num_input_tokens_seen": 101845970, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5859375, "step": 4719, "time_per_iteration": 2.4185657501220703 }, { "auxiliary_loss_clip": 0.01081545, "auxiliary_loss_mlp": 0.01035816, "balance_loss_clip": 1.01876903, "balance_loss_mlp": 1.02574909, "epoch": 0.2837817525928153, "flos": 20922469885440.0, "grad_norm": 1.6717654750465205, "language_loss": 0.80072182, "learning_rate": 3.2568292526913785e-06, "loss": 0.82189548, "num_input_tokens_seen": 101865040, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.55859375, "step": 4720, "time_per_iteration": 2.3804264068603516 }, { "auxiliary_loss_clip": 0.01082047, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.01422524, "balance_loss_mlp": 1.02523673, "epoch": 0.28384187584548326, "flos": 18915697180800.0, "grad_norm": 1.9430720799553722, "language_loss": 0.79105258, "learning_rate": 3.2565353303777353e-06, "loss": 0.81218743, "num_input_tokens_seen": 101883735, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.56640625, "step": 4721, "time_per_iteration": 2.3740267753601074 }, { "auxiliary_loss_clip": 0.01080163, "auxiliary_loss_mlp": 0.01032042, "balance_loss_clip": 1.01484036, "balance_loss_mlp": 1.0245564, "epoch": 0.2839019990981512, "flos": 27342753068160.0, "grad_norm": 3.506974492931057, "language_loss": 0.82741618, "learning_rate": 3.256241363221578e-06, "loss": 0.84853828, "num_input_tokens_seen": 101903025, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5546875, "step": 4722, "time_per_iteration": 3.805864095687866 }, { "auxiliary_loss_clip": 0.0108301, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.01716208, "balance_loss_mlp": 1.02534068, "epoch": 0.2839621223508192, "flos": 18113323248000.0, "grad_norm": 1.5526628097344308, "language_loss": 0.69962299, "learning_rate": 3.2559473512333986e-06, "loss": 0.72078663, "num_input_tokens_seen": 101922255, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.57421875, "step": 4723, "time_per_iteration": 2.376183032989502 }, { "auxiliary_loss_clip": 0.01081209, "auxiliary_loss_mlp": 0.01028522, "balance_loss_clip": 1.01126075, "balance_loss_mlp": 1.02463102, "epoch": 0.28402224560348716, "flos": 26357189898240.0, "grad_norm": 5.87236341881161, "language_loss": 0.78273642, "learning_rate": 3.2556532944236886e-06, "loss": 0.80383372, "num_input_tokens_seen": 101943100, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.56640625, "step": 4724, "time_per_iteration": 2.416632890701294 }, { "auxiliary_loss_clip": 0.01083681, "auxiliary_loss_mlp": 0.01035033, "balance_loss_clip": 1.01827192, "balance_loss_mlp": 1.02569687, "epoch": 0.2840823688561551, "flos": 24059660457600.0, "grad_norm": 3.129995104210972, "language_loss": 0.92471337, "learning_rate": 3.2553591928029423e-06, "loss": 0.94590056, "num_input_tokens_seen": 101963160, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.578125, "step": 4725, "time_per_iteration": 3.7411210536956787 }, { "auxiliary_loss_clip": 0.0108079, "auxiliary_loss_mlp": 0.01032894, "balance_loss_clip": 1.0165149, "balance_loss_mlp": 1.0248549, "epoch": 0.2841424921088231, "flos": 29458559548800.0, "grad_norm": 1.5321762783084771, "language_loss": 0.88700426, "learning_rate": 3.2550650463816557e-06, "loss": 0.90814114, "num_input_tokens_seen": 101984300, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.55859375, "step": 4726, "time_per_iteration": 2.452873945236206 }, { "auxiliary_loss_clip": 0.01084567, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.01754165, "balance_loss_mlp": 1.02653563, "epoch": 0.28420261536149105, "flos": 48098688301440.0, "grad_norm": 2.085717828183791, "language_loss": 0.78557849, "learning_rate": 3.2547708551703256e-06, "loss": 0.80676812, "num_input_tokens_seen": 102005765, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.58203125, "step": 4727, "time_per_iteration": 2.6010992527008057 }, { "auxiliary_loss_clip": 0.01078461, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 1.01164365, "balance_loss_mlp": 1.02441645, "epoch": 0.284262738614159, "flos": 25664966904960.0, "grad_norm": 2.181236525354293, "language_loss": 0.6623621, "learning_rate": 3.254476619179452e-06, "loss": 0.68341756, "num_input_tokens_seen": 102022755, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5390625, "step": 4728, "time_per_iteration": 2.397697687149048 }, { "auxiliary_loss_clip": 0.01078698, "auxiliary_loss_mlp": 0.01035251, "balance_loss_clip": 1.01803684, "balance_loss_mlp": 1.022825, "epoch": 0.284322861866827, "flos": 19717966379520.0, "grad_norm": 2.0880613096562373, "language_loss": 0.76450562, "learning_rate": 3.2541823384195344e-06, "loss": 0.78564513, "num_input_tokens_seen": 102041850, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.55859375, "step": 4729, "time_per_iteration": 2.364349126815796 }, { "auxiliary_loss_clip": 0.01085252, "auxiliary_loss_mlp": 0.01034272, "balance_loss_clip": 1.01569974, "balance_loss_mlp": 1.02589154, "epoch": 0.28438298511949495, "flos": 23914107532800.0, "grad_norm": 2.1033505537452823, "language_loss": 0.66414917, "learning_rate": 3.253888012901075e-06, "loss": 0.6853444, "num_input_tokens_seen": 102059500, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.59375, "step": 4730, "time_per_iteration": 2.398761034011841 }, { "auxiliary_loss_clip": 0.01080779, "auxiliary_loss_mlp": 0.01029782, "balance_loss_clip": 1.01216245, "balance_loss_mlp": 1.02532625, "epoch": 0.2844431083721629, "flos": 26066153871360.0, "grad_norm": 1.6757021520393514, "language_loss": 0.7444194, "learning_rate": 3.253593642634578e-06, "loss": 0.76552504, "num_input_tokens_seen": 102080460, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5546875, "step": 4731, "time_per_iteration": 2.4312949180603027 }, { "auxiliary_loss_clip": 0.01079736, "auxiliary_loss_mlp": 0.01027994, "balance_loss_clip": 1.01224649, "balance_loss_mlp": 1.02410972, "epoch": 0.2845032316248309, "flos": 25809297932160.0, "grad_norm": 1.3945684550033184, "language_loss": 0.8347922, "learning_rate": 3.2532992276305492e-06, "loss": 0.85586947, "num_input_tokens_seen": 102100950, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5546875, "step": 4732, "time_per_iteration": 2.4226808547973633 }, { "auxiliary_loss_clip": 0.01083088, "auxiliary_loss_mlp": 0.01041881, "balance_loss_clip": 1.02299786, "balance_loss_mlp": 1.02479708, "epoch": 0.2845633548774989, "flos": 19822322033280.0, "grad_norm": 1.603730583896771, "language_loss": 0.78541684, "learning_rate": 3.253004767899494e-06, "loss": 0.80666649, "num_input_tokens_seen": 102119345, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.5859375, "step": 4733, "time_per_iteration": 2.3851988315582275 }, { "auxiliary_loss_clip": 0.01086631, "auxiliary_loss_mlp": 0.0104098, "balance_loss_clip": 1.02204967, "balance_loss_mlp": 1.02597797, "epoch": 0.28462347813016686, "flos": 23181769520640.0, "grad_norm": 3.6474735654880055, "language_loss": 0.71310365, "learning_rate": 3.252710263451922e-06, "loss": 0.73437977, "num_input_tokens_seen": 102139050, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.609375, "step": 4734, "time_per_iteration": 2.4062321186065674 }, { "auxiliary_loss_clip": 0.01078318, "auxiliary_loss_mlp": 0.01030252, "balance_loss_clip": 1.01302624, "balance_loss_mlp": 1.02225661, "epoch": 0.2846836013828348, "flos": 18659504557440.0, "grad_norm": 1.7286046932569703, "language_loss": 0.74091792, "learning_rate": 3.2524157142983432e-06, "loss": 0.7620036, "num_input_tokens_seen": 102157935, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.55859375, "step": 4735, "time_per_iteration": 2.3742308616638184 }, { "auxiliary_loss_clip": 0.01081223, "auxiliary_loss_mlp": 0.01036929, "balance_loss_clip": 1.02143192, "balance_loss_mlp": 1.02526176, "epoch": 0.2847437246355028, "flos": 14172641579520.0, "grad_norm": 1.6945225941925561, "language_loss": 0.79613048, "learning_rate": 3.252121120449269e-06, "loss": 0.817312, "num_input_tokens_seen": 102175325, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.55859375, "step": 4736, "time_per_iteration": 2.3473095893859863 }, { "auxiliary_loss_clip": 0.0108318, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.01933074, "balance_loss_mlp": 1.02610743, "epoch": 0.28480384788817076, "flos": 29277080968320.0, "grad_norm": 2.2565295857640995, "language_loss": 0.59148276, "learning_rate": 3.251826481915213e-06, "loss": 0.61267751, "num_input_tokens_seen": 102196625, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5703125, "step": 4737, "time_per_iteration": 2.468946695327759 }, { "auxiliary_loss_clip": 0.01078242, "auxiliary_loss_mlp": 0.01032174, "balance_loss_clip": 1.0163312, "balance_loss_mlp": 1.02363682, "epoch": 0.2848639711408387, "flos": 22600221137280.0, "grad_norm": 2.4602628885469082, "language_loss": 0.86252427, "learning_rate": 3.2515317987066894e-06, "loss": 0.88362849, "num_input_tokens_seen": 102214975, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 4738, "time_per_iteration": 2.3778250217437744 }, { "auxiliary_loss_clip": 0.01086371, "auxiliary_loss_mlp": 0.01039284, "balance_loss_clip": 1.021415, "balance_loss_mlp": 1.02600527, "epoch": 0.2849240943935067, "flos": 17598598940160.0, "grad_norm": 2.3619050421880208, "language_loss": 0.89823699, "learning_rate": 3.2512370708342155e-06, "loss": 0.91949356, "num_input_tokens_seen": 102231885, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.60546875, "step": 4739, "time_per_iteration": 2.3856139183044434 }, { "auxiliary_loss_clip": 0.01084186, "auxiliary_loss_mlp": 0.01035109, "balance_loss_clip": 1.0193857, "balance_loss_mlp": 1.02761984, "epoch": 0.28498421764617465, "flos": 24861440897280.0, "grad_norm": 1.3593881426134444, "language_loss": 0.72196352, "learning_rate": 3.25094229830831e-06, "loss": 0.74315643, "num_input_tokens_seen": 102252725, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.56640625, "step": 4740, "time_per_iteration": 2.405941963195801 }, { "auxiliary_loss_clip": 0.01079864, "auxiliary_loss_mlp": 0.01027855, "balance_loss_clip": 1.0124476, "balance_loss_mlp": 1.02569914, "epoch": 0.2850443408988426, "flos": 22781490249600.0, "grad_norm": 1.5660180677894475, "language_loss": 0.77726334, "learning_rate": 3.2506474811394907e-06, "loss": 0.79834056, "num_input_tokens_seen": 102271730, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5390625, "step": 4741, "time_per_iteration": 2.409609079360962 }, { "auxiliary_loss_clip": 0.01082562, "auxiliary_loss_mlp": 0.01031153, "balance_loss_clip": 1.01353347, "balance_loss_mlp": 1.02279437, "epoch": 0.2851044641515106, "flos": 18843042908160.0, "grad_norm": 1.938301208297884, "language_loss": 0.75858092, "learning_rate": 3.2503526193382796e-06, "loss": 0.77971804, "num_input_tokens_seen": 102291325, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.59765625, "step": 4742, "time_per_iteration": 2.355731725692749 }, { "auxiliary_loss_clip": 0.01087653, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.02131653, "balance_loss_mlp": 1.02617121, "epoch": 0.28516458740417855, "flos": 18879492234240.0, "grad_norm": 2.874478916625614, "language_loss": 0.57906497, "learning_rate": 3.2500577129152004e-06, "loss": 0.60034823, "num_input_tokens_seen": 102309000, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.6171875, "step": 4743, "time_per_iteration": 2.367086172103882 }, { "auxiliary_loss_clip": 0.01086354, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.0167799, "balance_loss_mlp": 1.02716076, "epoch": 0.2852247106568465, "flos": 25298693164800.0, "grad_norm": 1.6172933456088672, "language_loss": 0.73978841, "learning_rate": 3.2497627618807767e-06, "loss": 0.76099747, "num_input_tokens_seen": 102329240, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58984375, "step": 4744, "time_per_iteration": 2.4027342796325684 }, { "auxiliary_loss_clip": 0.01080479, "auxiliary_loss_mlp": 0.01033434, "balance_loss_clip": 1.01743662, "balance_loss_mlp": 1.02479768, "epoch": 0.2852848339095145, "flos": 11654600791680.0, "grad_norm": 2.9323421196817407, "language_loss": 0.77296233, "learning_rate": 3.2494677662455355e-06, "loss": 0.79410142, "num_input_tokens_seen": 102344440, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5546875, "step": 4745, "time_per_iteration": 2.360199213027954 }, { "auxiliary_loss_clip": 0.01078276, "auxiliary_loss_mlp": 0.01031276, "balance_loss_clip": 1.01620817, "balance_loss_mlp": 1.0251931, "epoch": 0.2853449571621825, "flos": 12932386974720.0, "grad_norm": 1.674431880746482, "language_loss": 0.82517254, "learning_rate": 3.249172726020003e-06, "loss": 0.84626806, "num_input_tokens_seen": 102360985, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53125, "step": 4746, "time_per_iteration": 2.323788642883301 }, { "auxiliary_loss_clip": 0.01084347, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.01456237, "balance_loss_mlp": 1.02449775, "epoch": 0.28540508041485046, "flos": 20009560988160.0, "grad_norm": 1.7130432783013392, "language_loss": 0.79733384, "learning_rate": 3.248877641214709e-06, "loss": 0.81850219, "num_input_tokens_seen": 102380320, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.6015625, "step": 4747, "time_per_iteration": 2.3920280933380127 }, { "auxiliary_loss_clip": 0.01015087, "auxiliary_loss_mlp": 0.01007947, "balance_loss_clip": 1.00650477, "balance_loss_mlp": 1.00216842, "epoch": 0.28546520366751843, "flos": 68135864175360.0, "grad_norm": 0.7816728800379726, "language_loss": 0.60464311, "learning_rate": 3.248582511840185e-06, "loss": 0.6248734, "num_input_tokens_seen": 102439140, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.12890625, "step": 4748, "time_per_iteration": 2.9779348373413086 }, { "auxiliary_loss_clip": 0.01081255, "auxiliary_loss_mlp": 0.01037785, "balance_loss_clip": 1.01980829, "balance_loss_mlp": 1.02390397, "epoch": 0.2855253269201864, "flos": 13250969930880.0, "grad_norm": 3.095614863227557, "language_loss": 0.8050099, "learning_rate": 3.2482873379069627e-06, "loss": 0.82620031, "num_input_tokens_seen": 102450990, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5703125, "step": 4749, "time_per_iteration": 2.3364624977111816 }, { "auxiliary_loss_clip": 0.01081563, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.01670456, "balance_loss_mlp": 1.02490294, "epoch": 0.28558545017285436, "flos": 28619631555840.0, "grad_norm": 1.907682446751592, "language_loss": 0.70536107, "learning_rate": 3.2479921194255764e-06, "loss": 0.72650194, "num_input_tokens_seen": 102471820, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.56640625, "step": 4750, "time_per_iteration": 2.4354352951049805 }, { "auxiliary_loss_clip": 0.0108143, "auxiliary_loss_mlp": 0.01031612, "balance_loss_clip": 1.01534581, "balance_loss_mlp": 1.02665794, "epoch": 0.2856455734255223, "flos": 34129065611520.0, "grad_norm": 2.2333435843809397, "language_loss": 0.81769609, "learning_rate": 3.2476968564065613e-06, "loss": 0.83882642, "num_input_tokens_seen": 102492625, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.546875, "step": 4751, "time_per_iteration": 2.516223907470703 }, { "auxiliary_loss_clip": 0.01080285, "auxiliary_loss_mlp": 0.01030586, "balance_loss_clip": 1.01438582, "balance_loss_mlp": 1.02485085, "epoch": 0.2857056966781903, "flos": 39784576262400.0, "grad_norm": 1.9672147354921232, "language_loss": 0.79674977, "learning_rate": 3.247401548860455e-06, "loss": 0.81785846, "num_input_tokens_seen": 102514145, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5546875, "step": 4752, "time_per_iteration": 2.5313777923583984 }, { "auxiliary_loss_clip": 0.01081205, "auxiliary_loss_mlp": 0.01032259, "balance_loss_clip": 1.01484227, "balance_loss_mlp": 1.02471709, "epoch": 0.28576581993085826, "flos": 21871199704320.0, "grad_norm": 1.7690911339342643, "language_loss": 0.78648049, "learning_rate": 3.247106196797796e-06, "loss": 0.8076151, "num_input_tokens_seen": 102532365, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 4753, "time_per_iteration": 2.3985986709594727 }, { "auxiliary_loss_clip": 0.01085125, "auxiliary_loss_mlp": 0.01037552, "balance_loss_clip": 1.0188477, "balance_loss_mlp": 1.02637315, "epoch": 0.2858259431835262, "flos": 19090856805120.0, "grad_norm": 2.0545765545697146, "language_loss": 0.89546752, "learning_rate": 3.2468108002291256e-06, "loss": 0.91669428, "num_input_tokens_seen": 102548425, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.5859375, "step": 4754, "time_per_iteration": 3.737626791000366 }, { "auxiliary_loss_clip": 0.01080394, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.02078843, "balance_loss_mlp": 1.02513647, "epoch": 0.2858860664361942, "flos": 20333415559680.0, "grad_norm": 2.178307506385036, "language_loss": 0.82149816, "learning_rate": 3.2465153591649835e-06, "loss": 0.84267181, "num_input_tokens_seen": 102566370, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55078125, "step": 4755, "time_per_iteration": 2.388486623764038 }, { "auxiliary_loss_clip": 0.01081748, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.01823783, "balance_loss_mlp": 1.02429199, "epoch": 0.28594618968886215, "flos": 24460603044480.0, "grad_norm": 1.4870991408952334, "language_loss": 0.83555698, "learning_rate": 3.2462198736159157e-06, "loss": 0.85672116, "num_input_tokens_seen": 102588715, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.57421875, "step": 4756, "time_per_iteration": 2.4245755672454834 }, { "auxiliary_loss_clip": 0.01016326, "auxiliary_loss_mlp": 0.01005963, "balance_loss_clip": 1.00446093, "balance_loss_mlp": 1.00319099, "epoch": 0.2860063129415301, "flos": 71648964023040.0, "grad_norm": 0.8636197065724464, "language_loss": 0.61008024, "learning_rate": 3.245924343592466e-06, "loss": 0.63030314, "num_input_tokens_seen": 102656715, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.13085938, "step": 4757, "time_per_iteration": 3.183072566986084 }, { "auxiliary_loss_clip": 0.01081288, "auxiliary_loss_mlp": 0.01030334, "balance_loss_clip": 1.01418066, "balance_loss_mlp": 1.02508998, "epoch": 0.2860664361941981, "flos": 20557627511040.0, "grad_norm": 1.9371765635545932, "language_loss": 0.65740705, "learning_rate": 3.2456287691051815e-06, "loss": 0.67852324, "num_input_tokens_seen": 102676545, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5625, "step": 4758, "time_per_iteration": 3.7661352157592773 }, { "auxiliary_loss_clip": 0.01083954, "auxiliary_loss_mlp": 0.01034487, "balance_loss_clip": 1.01677227, "balance_loss_mlp": 1.02530575, "epoch": 0.2861265594468661, "flos": 35994788956800.0, "grad_norm": 1.368761470475045, "language_loss": 0.62679696, "learning_rate": 3.24533315016461e-06, "loss": 0.64798141, "num_input_tokens_seen": 102702875, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5859375, "step": 4759, "time_per_iteration": 2.5463781356811523 }, { "auxiliary_loss_clip": 0.01080846, "auxiliary_loss_mlp": 0.01034805, "balance_loss_clip": 1.01798511, "balance_loss_mlp": 1.02552783, "epoch": 0.28618668269953407, "flos": 20046394339200.0, "grad_norm": 2.008926840236646, "language_loss": 0.73942149, "learning_rate": 3.245037486781302e-06, "loss": 0.76057804, "num_input_tokens_seen": 102723160, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5546875, "step": 4760, "time_per_iteration": 2.407135009765625 }, { "auxiliary_loss_clip": 0.01081765, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.01808345, "balance_loss_mlp": 1.02776599, "epoch": 0.28624680595220203, "flos": 24970719052800.0, "grad_norm": 1.9409226081942277, "language_loss": 0.7247535, "learning_rate": 3.2447417789658083e-06, "loss": 0.74590206, "num_input_tokens_seen": 102743855, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5390625, "step": 4761, "time_per_iteration": 3.785780429840088 }, { "auxiliary_loss_clip": 0.010811, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.02072382, "balance_loss_mlp": 1.02470469, "epoch": 0.28630692920487, "flos": 22491152449920.0, "grad_norm": 1.8461289578296582, "language_loss": 0.7393499, "learning_rate": 3.244446026728683e-06, "loss": 0.76053357, "num_input_tokens_seen": 102761370, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5625, "step": 4762, "time_per_iteration": 2.3713278770446777 }, { "auxiliary_loss_clip": 0.010833, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.02136397, "balance_loss_mlp": 1.02807164, "epoch": 0.28636705245753796, "flos": 21248872986240.0, "grad_norm": 1.4879677319151083, "language_loss": 0.7620669, "learning_rate": 3.2441502300804803e-06, "loss": 0.78326958, "num_input_tokens_seen": 102780885, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5546875, "step": 4763, "time_per_iteration": 2.400841474533081 }, { "auxiliary_loss_clip": 0.01081427, "auxiliary_loss_mlp": 0.01035297, "balance_loss_clip": 1.0176661, "balance_loss_mlp": 1.02453864, "epoch": 0.28642717571020593, "flos": 24094678417920.0, "grad_norm": 1.8148746077923816, "language_loss": 0.76722366, "learning_rate": 3.2438543890317557e-06, "loss": 0.78839087, "num_input_tokens_seen": 102801000, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5703125, "step": 4764, "time_per_iteration": 3.7409722805023193 }, { "auxiliary_loss_clip": 0.01084945, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.01578689, "balance_loss_mlp": 1.02711725, "epoch": 0.2864872989628739, "flos": 22600290960000.0, "grad_norm": 1.8969712200285918, "language_loss": 0.70780838, "learning_rate": 3.2435585035930676e-06, "loss": 0.72901499, "num_input_tokens_seen": 102820230, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.578125, "step": 4765, "time_per_iteration": 2.411360740661621 }, { "auxiliary_loss_clip": 0.01078868, "auxiliary_loss_mlp": 0.01027702, "balance_loss_clip": 1.01191878, "balance_loss_mlp": 1.02442336, "epoch": 0.28654742221554186, "flos": 32743677018240.0, "grad_norm": 2.4557909303536225, "language_loss": 0.76024294, "learning_rate": 3.2432625737749754e-06, "loss": 0.78130865, "num_input_tokens_seen": 102842670, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.54296875, "step": 4766, "time_per_iteration": 2.459298849105835 }, { "auxiliary_loss_clip": 0.01081553, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.01739168, "balance_loss_mlp": 1.02570593, "epoch": 0.2866075454682098, "flos": 26980354488960.0, "grad_norm": 2.0039010229122143, "language_loss": 0.77118593, "learning_rate": 3.2429665995880397e-06, "loss": 0.79234087, "num_input_tokens_seen": 102864480, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55859375, "step": 4767, "time_per_iteration": 2.478315830230713 }, { "auxiliary_loss_clip": 0.01082231, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.01984024, "balance_loss_mlp": 1.02580881, "epoch": 0.2866676687208778, "flos": 23252852782080.0, "grad_norm": 2.5076167194422685, "language_loss": 0.65450412, "learning_rate": 3.242670581042824e-06, "loss": 0.67569774, "num_input_tokens_seen": 102883740, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5625, "step": 4768, "time_per_iteration": 2.3790221214294434 }, { "auxiliary_loss_clip": 0.01084885, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.02176225, "balance_loss_mlp": 1.02677703, "epoch": 0.28672779197354575, "flos": 21578662488960.0, "grad_norm": 1.9179452799176788, "language_loss": 0.70463544, "learning_rate": 3.2423745181498907e-06, "loss": 0.72587061, "num_input_tokens_seen": 102902945, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.58203125, "step": 4769, "time_per_iteration": 2.3950836658477783 }, { "auxiliary_loss_clip": 0.01082123, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.0129323, "balance_loss_mlp": 1.02455568, "epoch": 0.2867879152262137, "flos": 19864531733760.0, "grad_norm": 1.7190212108122154, "language_loss": 0.74997473, "learning_rate": 3.2420784109198076e-06, "loss": 0.77109134, "num_input_tokens_seen": 102922405, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.57421875, "step": 4770, "time_per_iteration": 2.3697502613067627 }, { "auxiliary_loss_clip": 0.01085625, "auxiliary_loss_mlp": 0.01029154, "balance_loss_clip": 1.01197648, "balance_loss_mlp": 1.02735996, "epoch": 0.2868480384788817, "flos": 28212265278720.0, "grad_norm": 2.1302557520463994, "language_loss": 0.67051756, "learning_rate": 3.241782259363141e-06, "loss": 0.69166529, "num_input_tokens_seen": 102938980, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58203125, "step": 4771, "time_per_iteration": 2.4191195964813232 }, { "auxiliary_loss_clip": 0.01018615, "auxiliary_loss_mlp": 0.01018355, "balance_loss_clip": 1.01679361, "balance_loss_mlp": 1.00506353, "epoch": 0.2869081617315497, "flos": 65421298010880.0, "grad_norm": 0.7819715610590118, "language_loss": 0.56847906, "learning_rate": 3.241486063490459e-06, "loss": 0.58884883, "num_input_tokens_seen": 103000405, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.13476562, "step": 4772, "time_per_iteration": 2.961507558822632 }, { "auxiliary_loss_clip": 0.01083547, "auxiliary_loss_mlp": 0.01032684, "balance_loss_clip": 1.01487398, "balance_loss_mlp": 1.02495086, "epoch": 0.28696828498421767, "flos": 18659748936960.0, "grad_norm": 3.2771513745431937, "language_loss": 0.83096749, "learning_rate": 3.241189823312334e-06, "loss": 0.85212982, "num_input_tokens_seen": 103017970, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.5859375, "step": 4773, "time_per_iteration": 2.374417304992676 }, { "auxiliary_loss_clip": 0.01080541, "auxiliary_loss_mlp": 0.01036948, "balance_loss_clip": 1.01841056, "balance_loss_mlp": 1.02297866, "epoch": 0.28702840823688563, "flos": 23658613136640.0, "grad_norm": 2.0784744402834887, "language_loss": 0.77495211, "learning_rate": 3.2408935388393358e-06, "loss": 0.79612702, "num_input_tokens_seen": 103036385, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.57421875, "step": 4774, "time_per_iteration": 2.3903603553771973 }, { "auxiliary_loss_clip": 0.01080459, "auxiliary_loss_mlp": 0.01034836, "balance_loss_clip": 1.01757431, "balance_loss_mlp": 1.02470946, "epoch": 0.2870885314895536, "flos": 13803993866880.0, "grad_norm": 2.618471709702865, "language_loss": 0.73552686, "learning_rate": 3.24059721008204e-06, "loss": 0.75667977, "num_input_tokens_seen": 103052170, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5546875, "step": 4775, "time_per_iteration": 2.3653087615966797 }, { "auxiliary_loss_clip": 0.01086239, "auxiliary_loss_mlp": 0.01032819, "balance_loss_clip": 1.01587892, "balance_loss_mlp": 1.02752268, "epoch": 0.28714865474222157, "flos": 17785768072320.0, "grad_norm": 1.634049834127892, "language_loss": 0.8820895, "learning_rate": 3.2403008370510207e-06, "loss": 0.90328014, "num_input_tokens_seen": 103070510, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5859375, "step": 4776, "time_per_iteration": 2.352720260620117 }, { "auxiliary_loss_clip": 0.01082094, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.01487613, "balance_loss_mlp": 1.02560639, "epoch": 0.28720877799488953, "flos": 15996574160640.0, "grad_norm": 1.644757821155631, "language_loss": 0.7422418, "learning_rate": 3.240004419756855e-06, "loss": 0.76338118, "num_input_tokens_seen": 103089590, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5625, "step": 4777, "time_per_iteration": 2.40632700920105 }, { "auxiliary_loss_clip": 0.01088839, "auxiliary_loss_mlp": 0.01042297, "balance_loss_clip": 1.02309251, "balance_loss_mlp": 1.02881432, "epoch": 0.2872689012475575, "flos": 20922085860480.0, "grad_norm": 5.515723857910451, "language_loss": 0.80392218, "learning_rate": 3.239707958210121e-06, "loss": 0.82523352, "num_input_tokens_seen": 103109080, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6015625, "step": 4778, "time_per_iteration": 2.372300624847412 }, { "auxiliary_loss_clip": 0.01081172, "auxiliary_loss_mlp": 0.01039664, "balance_loss_clip": 1.02213979, "balance_loss_mlp": 1.02499616, "epoch": 0.28732902450022546, "flos": 21324040876800.0, "grad_norm": 1.4811275860184796, "language_loss": 0.74068809, "learning_rate": 3.239411452421399e-06, "loss": 0.76189649, "num_input_tokens_seen": 103127755, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5625, "step": 4779, "time_per_iteration": 2.4285295009613037 }, { "auxiliary_loss_clip": 0.01084694, "auxiliary_loss_mlp": 0.01034938, "balance_loss_clip": 1.01655579, "balance_loss_mlp": 1.027174, "epoch": 0.2873891477528934, "flos": 20849326853760.0, "grad_norm": 1.5715025523207715, "language_loss": 0.75808293, "learning_rate": 3.2391149024012705e-06, "loss": 0.77927923, "num_input_tokens_seen": 103147035, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.57421875, "step": 4780, "time_per_iteration": 2.375492572784424 }, { "auxiliary_loss_clip": 0.01081962, "auxiliary_loss_mlp": 0.01041764, "balance_loss_clip": 1.0255754, "balance_loss_mlp": 1.0255785, "epoch": 0.2874492710055614, "flos": 17419110307200.0, "grad_norm": 1.6819876732505463, "language_loss": 0.8117137, "learning_rate": 3.238818308160318e-06, "loss": 0.83295095, "num_input_tokens_seen": 103165410, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.56640625, "step": 4781, "time_per_iteration": 2.373920440673828 }, { "auxiliary_loss_clip": 0.01085892, "auxiliary_loss_mlp": 0.01037457, "balance_loss_clip": 1.01897919, "balance_loss_mlp": 1.02688193, "epoch": 0.28750939425822936, "flos": 13405983834240.0, "grad_norm": 1.9601288546687496, "language_loss": 0.86021197, "learning_rate": 3.2385216697091277e-06, "loss": 0.88144541, "num_input_tokens_seen": 103183710, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.58984375, "step": 4782, "time_per_iteration": 2.3698599338531494 }, { "auxiliary_loss_clip": 0.01084486, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.01567316, "balance_loss_mlp": 1.02744699, "epoch": 0.2875695175108973, "flos": 21869000288640.0, "grad_norm": 1.4303310648446173, "language_loss": 0.71079791, "learning_rate": 3.238224987058284e-06, "loss": 0.73197877, "num_input_tokens_seen": 103203790, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5703125, "step": 4783, "time_per_iteration": 2.4121170043945312 }, { "auxiliary_loss_clip": 0.01083458, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.01886714, "balance_loss_mlp": 1.0266844, "epoch": 0.2876296407635653, "flos": 26244385695360.0, "grad_norm": 1.5075065675431283, "language_loss": 0.76762807, "learning_rate": 3.2379282602183757e-06, "loss": 0.78881812, "num_input_tokens_seen": 103223925, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.56640625, "step": 4784, "time_per_iteration": 2.4134573936462402 }, { "auxiliary_loss_clip": 0.01080009, "auxiliary_loss_mlp": 0.01033253, "balance_loss_clip": 1.01619458, "balance_loss_mlp": 1.02529049, "epoch": 0.28768976401623325, "flos": 25372499512320.0, "grad_norm": 1.5139826738496214, "language_loss": 0.75993824, "learning_rate": 3.237631489199993e-06, "loss": 0.78107083, "num_input_tokens_seen": 103244760, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.546875, "step": 4785, "time_per_iteration": 2.434781789779663 }, { "auxiliary_loss_clip": 0.01083871, "auxiliary_loss_mlp": 0.01041697, "balance_loss_clip": 1.02426863, "balance_loss_mlp": 1.02640808, "epoch": 0.28774988726890127, "flos": 30663063054720.0, "grad_norm": 1.9019191870517647, "language_loss": 0.82909286, "learning_rate": 3.2373346740137254e-06, "loss": 0.85034847, "num_input_tokens_seen": 103261995, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.57421875, "step": 4786, "time_per_iteration": 2.4442851543426514 }, { "auxiliary_loss_clip": 0.01084181, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.01757026, "balance_loss_mlp": 1.0267905, "epoch": 0.28781001052156924, "flos": 20594391039360.0, "grad_norm": 1.7124788550907577, "language_loss": 0.79891634, "learning_rate": 3.237037814670166e-06, "loss": 0.82009393, "num_input_tokens_seen": 103279780, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5703125, "step": 4787, "time_per_iteration": 2.3960533142089844 }, { "auxiliary_loss_clip": 0.01082349, "auxiliary_loss_mlp": 0.0103384, "balance_loss_clip": 1.01644707, "balance_loss_mlp": 1.02483702, "epoch": 0.2878701337742372, "flos": 26541112273920.0, "grad_norm": 2.5294664105344555, "language_loss": 0.83474612, "learning_rate": 3.2367409111799082e-06, "loss": 0.85590804, "num_input_tokens_seen": 103300580, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.57421875, "step": 4788, "time_per_iteration": 2.4305059909820557 }, { "auxiliary_loss_clip": 0.01085826, "auxiliary_loss_mlp": 0.01037884, "balance_loss_clip": 1.02094471, "balance_loss_mlp": 1.02760315, "epoch": 0.28793025702690517, "flos": 28145615760000.0, "grad_norm": 2.4678657642206128, "language_loss": 0.74098521, "learning_rate": 3.23644396355355e-06, "loss": 0.76222229, "num_input_tokens_seen": 103320430, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.58203125, "step": 4789, "time_per_iteration": 2.4473536014556885 }, { "auxiliary_loss_clip": 0.01079558, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.02096558, "balance_loss_mlp": 1.0238378, "epoch": 0.28799038027957313, "flos": 23804340618240.0, "grad_norm": 1.7057604328566531, "language_loss": 0.83811307, "learning_rate": 3.2361469718016867e-06, "loss": 0.859285, "num_input_tokens_seen": 103337695, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55859375, "step": 4790, "time_per_iteration": 2.4123950004577637 }, { "auxiliary_loss_clip": 0.01084175, "auxiliary_loss_mlp": 0.01035579, "balance_loss_clip": 1.01851463, "balance_loss_mlp": 1.02673221, "epoch": 0.2880505035322411, "flos": 22343085907200.0, "grad_norm": 1.6504365710813103, "language_loss": 0.77442652, "learning_rate": 3.2358499359349177e-06, "loss": 0.79562414, "num_input_tokens_seen": 103357010, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.57421875, "step": 4791, "time_per_iteration": 2.412493944168091 }, { "auxiliary_loss_clip": 0.01084831, "auxiliary_loss_mlp": 0.01033791, "balance_loss_clip": 1.01723278, "balance_loss_mlp": 1.02510417, "epoch": 0.28811062678490906, "flos": 18003277042560.0, "grad_norm": 1.7058950260499903, "language_loss": 0.7090838, "learning_rate": 3.2355528559638436e-06, "loss": 0.73027009, "num_input_tokens_seen": 103375600, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.59765625, "step": 4792, "time_per_iteration": 2.3605997562408447 }, { "auxiliary_loss_clip": 0.01085408, "auxiliary_loss_mlp": 0.01037614, "balance_loss_clip": 1.02001834, "balance_loss_mlp": 1.02661896, "epoch": 0.28817075003757703, "flos": 22089790926720.0, "grad_norm": 1.8470566608293222, "language_loss": 0.78890079, "learning_rate": 3.235255731899066e-06, "loss": 0.81013101, "num_input_tokens_seen": 103395225, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5859375, "step": 4793, "time_per_iteration": 2.398627519607544 }, { "auxiliary_loss_clip": 0.01081445, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.01828766, "balance_loss_mlp": 1.02635789, "epoch": 0.288230873290245, "flos": 41681512229760.0, "grad_norm": 1.5947705443077573, "language_loss": 0.78010929, "learning_rate": 3.2349585637511896e-06, "loss": 0.80126941, "num_input_tokens_seen": 103417245, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.55078125, "step": 4794, "time_per_iteration": 3.9291160106658936 }, { "auxiliary_loss_clip": 0.01083122, "auxiliary_loss_mlp": 0.01044496, "balance_loss_clip": 1.02718687, "balance_loss_mlp": 1.02604651, "epoch": 0.28829099654291296, "flos": 18623439256320.0, "grad_norm": 1.9674666181075553, "language_loss": 0.82565165, "learning_rate": 3.2346613515308176e-06, "loss": 0.84692788, "num_input_tokens_seen": 103435500, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5703125, "step": 4795, "time_per_iteration": 2.3600006103515625 }, { "auxiliary_loss_clip": 0.0108034, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.01966619, "balance_loss_mlp": 1.0264374, "epoch": 0.2883511197955809, "flos": 24673852828800.0, "grad_norm": 1.9384913000177635, "language_loss": 0.74597645, "learning_rate": 3.2343640952485586e-06, "loss": 0.76713276, "num_input_tokens_seen": 103451040, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5390625, "step": 4796, "time_per_iteration": 2.3823251724243164 }, { "auxiliary_loss_clip": 0.01086263, "auxiliary_loss_mlp": 0.01036379, "balance_loss_clip": 1.01729381, "balance_loss_mlp": 1.0253582, "epoch": 0.2884112430482489, "flos": 23111035372800.0, "grad_norm": 2.4303103980644996, "language_loss": 0.72752434, "learning_rate": 3.23406679491502e-06, "loss": 0.74875081, "num_input_tokens_seen": 103471330, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.609375, "step": 4797, "time_per_iteration": 3.788723945617676 }, { "auxiliary_loss_clip": 0.01081171, "auxiliary_loss_mlp": 0.01035123, "balance_loss_clip": 1.01852298, "balance_loss_mlp": 1.02505696, "epoch": 0.28847136630091685, "flos": 16872405327360.0, "grad_norm": 2.0830388666046225, "language_loss": 0.74375105, "learning_rate": 3.2337694505408117e-06, "loss": 0.76491398, "num_input_tokens_seen": 103488060, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5625, "step": 4798, "time_per_iteration": 2.3544442653656006 }, { "auxiliary_loss_clip": 0.01083517, "auxiliary_loss_mlp": 0.01040551, "balance_loss_clip": 1.02148914, "balance_loss_mlp": 1.02441955, "epoch": 0.2885314895535849, "flos": 25656588178560.0, "grad_norm": 3.58061133885997, "language_loss": 0.65353239, "learning_rate": 3.2334720621365457e-06, "loss": 0.67477304, "num_input_tokens_seen": 103503600, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.59375, "step": 4799, "time_per_iteration": 2.393225908279419 }, { "auxiliary_loss_clip": 0.01082389, "auxiliary_loss_mlp": 0.01039719, "balance_loss_clip": 1.02271986, "balance_loss_mlp": 1.02558613, "epoch": 0.28859161280625284, "flos": 21106147881600.0, "grad_norm": 2.0266151342269616, "language_loss": 0.82235265, "learning_rate": 3.2331746297128345e-06, "loss": 0.84357375, "num_input_tokens_seen": 103524195, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5703125, "step": 4800, "time_per_iteration": 2.398787260055542 }, { "auxiliary_loss_clip": 0.01079156, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.01694703, "balance_loss_mlp": 1.02517045, "epoch": 0.2886517360589208, "flos": 26468318355840.0, "grad_norm": 2.205300128290066, "language_loss": 0.90891337, "learning_rate": 3.2328771532802934e-06, "loss": 0.93003696, "num_input_tokens_seen": 103545235, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5390625, "step": 4801, "time_per_iteration": 3.7470779418945312 }, { "auxiliary_loss_clip": 0.01082675, "auxiliary_loss_mlp": 0.01034742, "balance_loss_clip": 1.01758766, "balance_loss_mlp": 1.02651453, "epoch": 0.28871185931158877, "flos": 25264094140800.0, "grad_norm": 5.102993571991013, "language_loss": 0.73611045, "learning_rate": 3.232579632849537e-06, "loss": 0.75728464, "num_input_tokens_seen": 103563305, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5625, "step": 4802, "time_per_iteration": 2.443101406097412 }, { "auxiliary_loss_clip": 0.01020999, "auxiliary_loss_mlp": 0.01012151, "balance_loss_clip": 1.0105896, "balance_loss_mlp": 1.00733137, "epoch": 0.28877198256425674, "flos": 66662390488320.0, "grad_norm": 0.7838931010390363, "language_loss": 0.63035232, "learning_rate": 3.232282068431185e-06, "loss": 0.65068382, "num_input_tokens_seen": 103625025, "router_z_loss_clip": 0.01556396, "router_z_loss_mlp": 0.13671875, "step": 4803, "time_per_iteration": 2.984715700149536 }, { "auxiliary_loss_clip": 0.01080544, "auxiliary_loss_mlp": 0.01037879, "balance_loss_clip": 1.02194643, "balance_loss_mlp": 1.02412534, "epoch": 0.2888321058169247, "flos": 20301993469440.0, "grad_norm": 1.7477692837910497, "language_loss": 0.70655793, "learning_rate": 3.2319844600358554e-06, "loss": 0.72774214, "num_input_tokens_seen": 103644235, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.56640625, "step": 4804, "time_per_iteration": 3.8123722076416016 }, { "auxiliary_loss_clip": 0.01083416, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.01964021, "balance_loss_mlp": 1.02464998, "epoch": 0.28889222906959267, "flos": 25515643553280.0, "grad_norm": 2.1381345292100202, "language_loss": 0.68044317, "learning_rate": 3.231686807674169e-06, "loss": 0.70165426, "num_input_tokens_seen": 103664700, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.5859375, "step": 4805, "time_per_iteration": 2.402233839035034 }, { "auxiliary_loss_clip": 0.01081487, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.01514745, "balance_loss_mlp": 1.02434111, "epoch": 0.28895235232226063, "flos": 32669940493440.0, "grad_norm": 1.3754379833743893, "language_loss": 0.69341135, "learning_rate": 3.2313891113567496e-06, "loss": 0.71454567, "num_input_tokens_seen": 103686595, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 4806, "time_per_iteration": 2.4646334648132324 }, { "auxiliary_loss_clip": 0.01079249, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.02004588, "balance_loss_mlp": 1.02471232, "epoch": 0.2890124755749286, "flos": 29713425540480.0, "grad_norm": 1.5752426751402027, "language_loss": 0.71645749, "learning_rate": 3.2310913710942193e-06, "loss": 0.73761404, "num_input_tokens_seen": 103707525, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 4807, "time_per_iteration": 2.4353389739990234 }, { "auxiliary_loss_clip": 0.01081557, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.01435733, "balance_loss_mlp": 1.02440739, "epoch": 0.28907259882759656, "flos": 22673364168960.0, "grad_norm": 1.82325769433667, "language_loss": 0.81497622, "learning_rate": 3.2307935868972055e-06, "loss": 0.83609927, "num_input_tokens_seen": 103727905, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4808, "time_per_iteration": 2.391050100326538 }, { "auxiliary_loss_clip": 0.01078893, "auxiliary_loss_mlp": 0.01040459, "balance_loss_clip": 1.02381706, "balance_loss_mlp": 1.02490616, "epoch": 0.2891327220802645, "flos": 22564923886080.0, "grad_norm": 1.4461880787855734, "language_loss": 0.78218162, "learning_rate": 3.2304957587763344e-06, "loss": 0.80337512, "num_input_tokens_seen": 103748335, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5390625, "step": 4809, "time_per_iteration": 2.385453701019287 }, { "auxiliary_loss_clip": 0.01084992, "auxiliary_loss_mlp": 0.01040933, "balance_loss_clip": 1.02265799, "balance_loss_mlp": 1.0242722, "epoch": 0.2891928453329325, "flos": 21651735697920.0, "grad_norm": 1.7445743669993674, "language_loss": 0.7866075, "learning_rate": 3.2301978867422352e-06, "loss": 0.80786681, "num_input_tokens_seen": 103767020, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.609375, "step": 4810, "time_per_iteration": 2.392239570617676 }, { "auxiliary_loss_clip": 0.01080455, "auxiliary_loss_mlp": 0.01030121, "balance_loss_clip": 1.01421237, "balance_loss_mlp": 1.02487373, "epoch": 0.28925296858560046, "flos": 23220976844160.0, "grad_norm": 1.727744313539586, "language_loss": 0.76764137, "learning_rate": 3.2298999708055375e-06, "loss": 0.78874707, "num_input_tokens_seen": 103786355, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5546875, "step": 4811, "time_per_iteration": 2.3839938640594482 }, { "auxiliary_loss_clip": 0.01079242, "auxiliary_loss_mlp": 0.01039131, "balance_loss_clip": 1.02209604, "balance_loss_mlp": 1.02362239, "epoch": 0.2893130918382685, "flos": 28620399605760.0, "grad_norm": 1.4053197116203917, "language_loss": 0.77415568, "learning_rate": 3.229602010976873e-06, "loss": 0.79533941, "num_input_tokens_seen": 103809345, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5546875, "step": 4812, "time_per_iteration": 2.4538538455963135 }, { "auxiliary_loss_clip": 0.0101508, "auxiliary_loss_mlp": 0.01020411, "balance_loss_clip": 1.01896822, "balance_loss_mlp": 1.00233889, "epoch": 0.28937321509093644, "flos": 72297615772800.0, "grad_norm": 0.8436769741053833, "language_loss": 0.60269272, "learning_rate": 3.2293040072668768e-06, "loss": 0.62304771, "num_input_tokens_seen": 103871180, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.12695312, "step": 4813, "time_per_iteration": 3.167846441268921 }, { "auxiliary_loss_clip": 0.0107974, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.01550364, "balance_loss_mlp": 1.02399731, "epoch": 0.2894333383436044, "flos": 16215479585280.0, "grad_norm": 2.7462003041272225, "language_loss": 0.82181168, "learning_rate": 3.229005959686182e-06, "loss": 0.84292722, "num_input_tokens_seen": 103889040, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.55859375, "step": 4814, "time_per_iteration": 2.3672327995300293 }, { "auxiliary_loss_clip": 0.01085028, "auxiliary_loss_mlp": 0.01042757, "balance_loss_clip": 1.02534032, "balance_loss_mlp": 1.02587438, "epoch": 0.2894934615962724, "flos": 24827086252800.0, "grad_norm": 1.5383767770677812, "language_loss": 0.72341979, "learning_rate": 3.2287078682454255e-06, "loss": 0.74469769, "num_input_tokens_seen": 103910380, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59375, "step": 4815, "time_per_iteration": 2.4240670204162598 }, { "auxiliary_loss_clip": 0.01078624, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.02207923, "balance_loss_mlp": 1.02578044, "epoch": 0.28955358484894034, "flos": 20448907937280.0, "grad_norm": 1.3841100290299935, "language_loss": 0.70090729, "learning_rate": 3.2284097329552465e-06, "loss": 0.72206384, "num_input_tokens_seen": 103929955, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.52734375, "step": 4816, "time_per_iteration": 2.4304616451263428 }, { "auxiliary_loss_clip": 0.01085138, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.01666856, "balance_loss_mlp": 1.02484918, "epoch": 0.2896137081016083, "flos": 22564086013440.0, "grad_norm": 2.0496427179592764, "language_loss": 0.73847157, "learning_rate": 3.2281115538262844e-06, "loss": 0.75967181, "num_input_tokens_seen": 103948020, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.6015625, "step": 4817, "time_per_iteration": 2.4246084690093994 }, { "auxiliary_loss_clip": 0.01085699, "auxiliary_loss_mlp": 0.01038941, "balance_loss_clip": 1.02074957, "balance_loss_mlp": 1.02650654, "epoch": 0.28967383135427627, "flos": 26686735021440.0, "grad_norm": 1.6915612583310022, "language_loss": 0.76172686, "learning_rate": 3.227813330869179e-06, "loss": 0.78297329, "num_input_tokens_seen": 103968740, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 4818, "time_per_iteration": 2.442655324935913 }, { "auxiliary_loss_clip": 0.01083231, "auxiliary_loss_mlp": 0.0103029, "balance_loss_clip": 1.01321888, "balance_loss_mlp": 1.0253706, "epoch": 0.28973395460694423, "flos": 15557401768320.0, "grad_norm": 1.8303846110340454, "language_loss": 0.79493523, "learning_rate": 3.2275150640945742e-06, "loss": 0.81607044, "num_input_tokens_seen": 103986005, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.578125, "step": 4819, "time_per_iteration": 2.3795225620269775 }, { "auxiliary_loss_clip": 0.01086843, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.01699209, "balance_loss_mlp": 1.02553666, "epoch": 0.2897940778596122, "flos": 18696477553920.0, "grad_norm": 1.896152999150647, "language_loss": 0.78887463, "learning_rate": 3.227216753513115e-06, "loss": 0.81011081, "num_input_tokens_seen": 104005070, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.609375, "step": 4820, "time_per_iteration": 2.36979603767395 }, { "auxiliary_loss_clip": 0.01086203, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.01493227, "balance_loss_mlp": 1.02753949, "epoch": 0.28985420111228016, "flos": 18769306383360.0, "grad_norm": 2.1469533656701505, "language_loss": 0.7271353, "learning_rate": 3.2269183991354464e-06, "loss": 0.74831295, "num_input_tokens_seen": 104022945, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5859375, "step": 4821, "time_per_iteration": 2.354018211364746 }, { "auxiliary_loss_clip": 0.0108335, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.01625037, "balance_loss_mlp": 1.02552605, "epoch": 0.28991432436494813, "flos": 23068895495040.0, "grad_norm": 1.7789812989965386, "language_loss": 0.72078979, "learning_rate": 3.226620000972216e-06, "loss": 0.74194992, "num_input_tokens_seen": 104042080, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.578125, "step": 4822, "time_per_iteration": 2.399756669998169 }, { "auxiliary_loss_clip": 0.01082567, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.02093506, "balance_loss_mlp": 1.02557909, "epoch": 0.2899744476176161, "flos": 17602229721600.0, "grad_norm": 1.6526940471716602, "language_loss": 0.66189766, "learning_rate": 3.2263215590340726e-06, "loss": 0.68309653, "num_input_tokens_seen": 104060975, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4823, "time_per_iteration": 2.3464319705963135 }, { "auxiliary_loss_clip": 0.01081034, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.01583338, "balance_loss_mlp": 1.02479911, "epoch": 0.29003457087028406, "flos": 22308277415040.0, "grad_norm": 2.2258522926954902, "language_loss": 0.81182373, "learning_rate": 3.2260230733316683e-06, "loss": 0.83296108, "num_input_tokens_seen": 104081395, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5625, "step": 4824, "time_per_iteration": 2.3960704803466797 }, { "auxiliary_loss_clip": 0.0108277, "auxiliary_loss_mlp": 0.01038686, "balance_loss_clip": 1.02033961, "balance_loss_mlp": 1.02461863, "epoch": 0.2900946941229521, "flos": 21943888888320.0, "grad_norm": 2.0599624089569186, "language_loss": 0.7224611, "learning_rate": 3.2257245438756534e-06, "loss": 0.74367565, "num_input_tokens_seen": 104099995, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.58203125, "step": 4825, "time_per_iteration": 2.3737518787384033 }, { "auxiliary_loss_clip": 0.01085591, "auxiliary_loss_mlp": 0.01030159, "balance_loss_clip": 1.01271844, "balance_loss_mlp": 1.02707791, "epoch": 0.29015481737562004, "flos": 17931181351680.0, "grad_norm": 2.2898350457481857, "language_loss": 0.73085475, "learning_rate": 3.2254259706766824e-06, "loss": 0.75201225, "num_input_tokens_seen": 104118930, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5859375, "step": 4826, "time_per_iteration": 2.3637855052948 }, { "auxiliary_loss_clip": 0.01080506, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.01483226, "balance_loss_mlp": 1.02390599, "epoch": 0.290214940628288, "flos": 22782432856320.0, "grad_norm": 3.359252595343814, "language_loss": 0.68858981, "learning_rate": 3.2251273537454113e-06, "loss": 0.70971298, "num_input_tokens_seen": 104136940, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.56640625, "step": 4827, "time_per_iteration": 2.3695380687713623 }, { "auxiliary_loss_clip": 0.01084042, "auxiliary_loss_mlp": 0.01035897, "balance_loss_clip": 1.01819396, "balance_loss_mlp": 1.0265789, "epoch": 0.290275063880956, "flos": 20005581093120.0, "grad_norm": 1.824959284257807, "language_loss": 0.80134833, "learning_rate": 3.224828693092496e-06, "loss": 0.82254779, "num_input_tokens_seen": 104154280, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.57421875, "step": 4828, "time_per_iteration": 2.381049871444702 }, { "auxiliary_loss_clip": 0.01082451, "auxiliary_loss_mlp": 0.0104034, "balance_loss_clip": 1.02346587, "balance_loss_mlp": 1.02582574, "epoch": 0.29033518713362394, "flos": 22052538639360.0, "grad_norm": 1.859359626107, "language_loss": 0.80581927, "learning_rate": 3.2245299887285954e-06, "loss": 0.82704711, "num_input_tokens_seen": 104172605, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.56640625, "step": 4829, "time_per_iteration": 2.3636674880981445 }, { "auxiliary_loss_clip": 0.01082061, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01545668, "balance_loss_mlp": 1.02618432, "epoch": 0.2903953103862919, "flos": 25628866692480.0, "grad_norm": 1.6939863258155763, "language_loss": 0.82723534, "learning_rate": 3.224231240664369e-06, "loss": 0.84837615, "num_input_tokens_seen": 104194120, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55859375, "step": 4830, "time_per_iteration": 2.428407907485962 }, { "auxiliary_loss_clip": 0.01083135, "auxiliary_loss_mlp": 0.010374, "balance_loss_clip": 1.02028179, "balance_loss_mlp": 1.02500689, "epoch": 0.29045543363895987, "flos": 16944919954560.0, "grad_norm": 2.582446349843684, "language_loss": 0.79058111, "learning_rate": 3.223932448910479e-06, "loss": 0.81178641, "num_input_tokens_seen": 104210875, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58203125, "step": 4831, "time_per_iteration": 2.3361928462982178 }, { "auxiliary_loss_clip": 0.01078225, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.01888967, "balance_loss_mlp": 1.02458739, "epoch": 0.29051555689162784, "flos": 26394302540160.0, "grad_norm": 1.6636560669260019, "language_loss": 0.74149847, "learning_rate": 3.2236336134775883e-06, "loss": 0.76262271, "num_input_tokens_seen": 104229875, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53515625, "step": 4832, "time_per_iteration": 2.414759874343872 }, { "auxiliary_loss_clip": 0.01080248, "auxiliary_loss_mlp": 0.01028424, "balance_loss_clip": 1.01357675, "balance_loss_mlp": 1.02451015, "epoch": 0.2905756801442958, "flos": 21102866213760.0, "grad_norm": 1.6261987899159107, "language_loss": 0.76170707, "learning_rate": 3.2233347343763614e-06, "loss": 0.78279382, "num_input_tokens_seen": 104250405, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5546875, "step": 4833, "time_per_iteration": 3.8914058208465576 }, { "auxiliary_loss_clip": 0.01078607, "auxiliary_loss_mlp": 0.01032211, "balance_loss_clip": 1.01640415, "balance_loss_mlp": 1.02417588, "epoch": 0.29063580339696377, "flos": 15705154108800.0, "grad_norm": 1.7451189603754897, "language_loss": 0.6479373, "learning_rate": 3.2230358116174645e-06, "loss": 0.66904545, "num_input_tokens_seen": 104269185, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 4834, "time_per_iteration": 2.359287977218628 }, { "auxiliary_loss_clip": 0.01077039, "auxiliary_loss_mlp": 0.01025505, "balance_loss_clip": 1.00991201, "balance_loss_mlp": 1.02370596, "epoch": 0.29069592664963173, "flos": 24643827192960.0, "grad_norm": 1.6971943668901526, "language_loss": 0.71625978, "learning_rate": 3.2227368452115658e-06, "loss": 0.73728526, "num_input_tokens_seen": 104289400, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 4835, "time_per_iteration": 2.399034023284912 }, { "auxiliary_loss_clip": 0.01080264, "auxiliary_loss_mlp": 0.01034315, "balance_loss_clip": 1.01929474, "balance_loss_mlp": 1.02554822, "epoch": 0.2907560499022997, "flos": 24972569354880.0, "grad_norm": 1.5955799720644368, "language_loss": 0.79172188, "learning_rate": 3.2224378351693337e-06, "loss": 0.8128677, "num_input_tokens_seen": 104310485, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.546875, "step": 4836, "time_per_iteration": 3.7992281913757324 }, { "auxiliary_loss_clip": 0.01078332, "auxiliary_loss_mlp": 0.01033185, "balance_loss_clip": 1.01762795, "balance_loss_mlp": 1.02478957, "epoch": 0.29081617315496766, "flos": 18656606914560.0, "grad_norm": 1.6570337145681453, "language_loss": 0.80692613, "learning_rate": 3.2221387815014405e-06, "loss": 0.82804132, "num_input_tokens_seen": 104327330, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 4837, "time_per_iteration": 2.3469960689544678 }, { "auxiliary_loss_clip": 0.01079001, "auxiliary_loss_mlp": 0.0103406, "balance_loss_clip": 1.01747775, "balance_loss_mlp": 1.0222944, "epoch": 0.2908762964076356, "flos": 35329693956480.0, "grad_norm": 1.8333048825527871, "language_loss": 0.6711567, "learning_rate": 3.2218396842185576e-06, "loss": 0.69228733, "num_input_tokens_seen": 104350350, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5703125, "step": 4838, "time_per_iteration": 2.5223610401153564 }, { "auxiliary_loss_clip": 0.01084103, "auxiliary_loss_mlp": 0.01037505, "balance_loss_clip": 1.01902723, "balance_loss_mlp": 1.02553546, "epoch": 0.29093641966030365, "flos": 23075179539840.0, "grad_norm": 1.6165504959692143, "language_loss": 0.71655321, "learning_rate": 3.2215405433313595e-06, "loss": 0.73776925, "num_input_tokens_seen": 104369995, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.5859375, "step": 4839, "time_per_iteration": 2.3829758167266846 }, { "auxiliary_loss_clip": 0.01079707, "auxiliary_loss_mlp": 0.01030924, "balance_loss_clip": 1.01615405, "balance_loss_mlp": 1.02491629, "epoch": 0.2909965429129716, "flos": 35953940799360.0, "grad_norm": 1.795211431657631, "language_loss": 0.76032734, "learning_rate": 3.221241358850521e-06, "loss": 0.78143358, "num_input_tokens_seen": 104392285, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.546875, "step": 4840, "time_per_iteration": 3.8981010913848877 }, { "auxiliary_loss_clip": 0.01081852, "auxiliary_loss_mlp": 0.01038773, "balance_loss_clip": 1.02273917, "balance_loss_mlp": 1.02586091, "epoch": 0.2910566661656396, "flos": 30879001013760.0, "grad_norm": 1.7548457654858056, "language_loss": 0.60732472, "learning_rate": 3.2209421307867205e-06, "loss": 0.62853098, "num_input_tokens_seen": 104412640, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55859375, "step": 4841, "time_per_iteration": 2.4533472061157227 }, { "auxiliary_loss_clip": 0.01080434, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.0171665, "balance_loss_mlp": 1.0246284, "epoch": 0.29111678941830754, "flos": 30008825487360.0, "grad_norm": 1.4024801716579982, "language_loss": 0.71260989, "learning_rate": 3.2206428591506358e-06, "loss": 0.73375505, "num_input_tokens_seen": 104435245, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.55859375, "step": 4842, "time_per_iteration": 2.4548144340515137 }, { "auxiliary_loss_clip": 0.01078416, "auxiliary_loss_mlp": 0.01034786, "balance_loss_clip": 1.01953864, "balance_loss_mlp": 1.02398825, "epoch": 0.2911769126709755, "flos": 22856274115200.0, "grad_norm": 1.6124706220855658, "language_loss": 0.72992384, "learning_rate": 3.220343543952947e-06, "loss": 0.75105584, "num_input_tokens_seen": 104455395, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.546875, "step": 4843, "time_per_iteration": 3.7670974731445312 }, { "auxiliary_loss_clip": 0.01078737, "auxiliary_loss_mlp": 0.01032062, "balance_loss_clip": 1.01528931, "balance_loss_mlp": 1.02261329, "epoch": 0.2912370359236435, "flos": 21649501370880.0, "grad_norm": 2.592727729122953, "language_loss": 0.58053476, "learning_rate": 3.2200441852043367e-06, "loss": 0.60164273, "num_input_tokens_seen": 104473350, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 4844, "time_per_iteration": 2.3729407787323 }, { "auxiliary_loss_clip": 0.01085042, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.01942754, "balance_loss_mlp": 1.02686727, "epoch": 0.29129715917631144, "flos": 22892234682240.0, "grad_norm": 2.1923817293059846, "language_loss": 0.86279273, "learning_rate": 3.2197447829154875e-06, "loss": 0.88400924, "num_input_tokens_seen": 104492265, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58203125, "step": 4845, "time_per_iteration": 2.4020376205444336 }, { "auxiliary_loss_clip": 0.01081456, "auxiliary_loss_mlp": 0.01039662, "balance_loss_clip": 1.02263904, "balance_loss_mlp": 1.02454257, "epoch": 0.2913572824289794, "flos": 22673364168960.0, "grad_norm": 1.8233020624281395, "language_loss": 0.6660794, "learning_rate": 3.2194453370970844e-06, "loss": 0.68729067, "num_input_tokens_seen": 104510755, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5703125, "step": 4846, "time_per_iteration": 2.383287191390991 }, { "auxiliary_loss_clip": 0.01080792, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.0143373, "balance_loss_mlp": 1.02626097, "epoch": 0.29141740568164737, "flos": 23106427073280.0, "grad_norm": 2.9322882141661952, "language_loss": 0.70153689, "learning_rate": 3.219145847759814e-06, "loss": 0.72264624, "num_input_tokens_seen": 104530830, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.54296875, "step": 4847, "time_per_iteration": 2.4253220558166504 }, { "auxiliary_loss_clip": 0.01080213, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.01851881, "balance_loss_mlp": 1.02504373, "epoch": 0.29147752893431533, "flos": 23585889041280.0, "grad_norm": 1.5276903295987276, "language_loss": 0.74004263, "learning_rate": 3.218846314914365e-06, "loss": 0.76119471, "num_input_tokens_seen": 104550115, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.55078125, "step": 4848, "time_per_iteration": 2.3964364528656006 }, { "auxiliary_loss_clip": 0.01080892, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.01500416, "balance_loss_mlp": 1.02441859, "epoch": 0.2915376521869833, "flos": 20591004637440.0, "grad_norm": 2.0042220062771046, "language_loss": 0.76787412, "learning_rate": 3.218546738571425e-06, "loss": 0.78900695, "num_input_tokens_seen": 104566255, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 4849, "time_per_iteration": 2.3720860481262207 }, { "auxiliary_loss_clip": 0.01083458, "auxiliary_loss_mlp": 0.01031891, "balance_loss_clip": 1.01480842, "balance_loss_mlp": 1.02582693, "epoch": 0.29159777543965126, "flos": 20810503555200.0, "grad_norm": 1.745789091985758, "language_loss": 0.78179145, "learning_rate": 3.2182471187416874e-06, "loss": 0.80294496, "num_input_tokens_seen": 104585235, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.578125, "step": 4850, "time_per_iteration": 2.3725900650024414 }, { "auxiliary_loss_clip": 0.01081255, "auxiliary_loss_mlp": 0.010333, "balance_loss_clip": 1.01680112, "balance_loss_mlp": 1.02514887, "epoch": 0.29165789869231923, "flos": 24242989340160.0, "grad_norm": 1.9998012325708516, "language_loss": 0.75585085, "learning_rate": 3.2179474554358438e-06, "loss": 0.77699637, "num_input_tokens_seen": 104605315, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5625, "step": 4851, "time_per_iteration": 2.413799285888672 }, { "auxiliary_loss_clip": 0.01079551, "auxiliary_loss_mlp": 0.0103613, "balance_loss_clip": 1.02114487, "balance_loss_mlp": 1.02531195, "epoch": 0.29171802194498725, "flos": 28948653008640.0, "grad_norm": 1.390250495806322, "language_loss": 0.77273381, "learning_rate": 3.2176477486645883e-06, "loss": 0.79389066, "num_input_tokens_seen": 104626055, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.54296875, "step": 4852, "time_per_iteration": 2.4361729621887207 }, { "auxiliary_loss_clip": 0.01079641, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.0230999, "balance_loss_mlp": 1.02406645, "epoch": 0.2917781451976552, "flos": 22597218760320.0, "grad_norm": 1.530315160990392, "language_loss": 0.77832162, "learning_rate": 3.2173479984386165e-06, "loss": 0.79951537, "num_input_tokens_seen": 104646005, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5546875, "step": 4853, "time_per_iteration": 2.4366989135742188 }, { "auxiliary_loss_clip": 0.0108095, "auxiliary_loss_mlp": 0.01033931, "balance_loss_clip": 1.01707482, "balance_loss_mlp": 1.02429318, "epoch": 0.2918382684503232, "flos": 21573530519040.0, "grad_norm": 2.513343578178358, "language_loss": 0.88420606, "learning_rate": 3.217048204768626e-06, "loss": 0.90535486, "num_input_tokens_seen": 104661620, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.56640625, "step": 4854, "time_per_iteration": 2.348270893096924 }, { "auxiliary_loss_clip": 0.01082857, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.02329183, "balance_loss_mlp": 1.02627993, "epoch": 0.29189839170299114, "flos": 24352337318400.0, "grad_norm": 1.7781811804924719, "language_loss": 0.86646366, "learning_rate": 3.2167483676653167e-06, "loss": 0.88770688, "num_input_tokens_seen": 104681445, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.56640625, "step": 4855, "time_per_iteration": 2.4230921268463135 }, { "auxiliary_loss_clip": 0.0102273, "auxiliary_loss_mlp": 0.01013921, "balance_loss_clip": 1.01213312, "balance_loss_mlp": 1.01027477, "epoch": 0.2919585149556591, "flos": 71313065032320.0, "grad_norm": 0.8010861439518471, "language_loss": 0.60175705, "learning_rate": 3.216448487139387e-06, "loss": 0.6221236, "num_input_tokens_seen": 104747945, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.12451172, "step": 4856, "time_per_iteration": 3.1278514862060547 }, { "auxiliary_loss_clip": 0.01079198, "auxiliary_loss_mlp": 0.01031544, "balance_loss_clip": 1.01648211, "balance_loss_mlp": 1.02405477, "epoch": 0.2920186382083271, "flos": 15632290368000.0, "grad_norm": 2.1411161978615634, "language_loss": 0.68328226, "learning_rate": 3.2161485632015397e-06, "loss": 0.70438963, "num_input_tokens_seen": 104766225, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.55078125, "step": 4857, "time_per_iteration": 2.403705358505249 }, { "auxiliary_loss_clip": 0.01081421, "auxiliary_loss_mlp": 0.01033698, "balance_loss_clip": 1.01780152, "balance_loss_mlp": 1.02615559, "epoch": 0.29207876146099504, "flos": 28364765564160.0, "grad_norm": 1.9635585952318306, "language_loss": 0.84004754, "learning_rate": 3.2158485958624794e-06, "loss": 0.86119872, "num_input_tokens_seen": 104785345, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5546875, "step": 4858, "time_per_iteration": 2.444014549255371 }, { "auxiliary_loss_clip": 0.01082481, "auxiliary_loss_mlp": 0.01031619, "balance_loss_clip": 1.01531112, "balance_loss_mlp": 1.02627563, "epoch": 0.292138884713663, "flos": 21869907984000.0, "grad_norm": 1.8162958687013442, "language_loss": 0.77836138, "learning_rate": 3.2155485851329095e-06, "loss": 0.79950237, "num_input_tokens_seen": 104804560, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5625, "step": 4859, "time_per_iteration": 2.398045539855957 }, { "auxiliary_loss_clip": 0.01084891, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.01970196, "balance_loss_mlp": 1.02530432, "epoch": 0.29219900796633097, "flos": 20991598110720.0, "grad_norm": 3.12915696884728, "language_loss": 0.68915278, "learning_rate": 3.215248531023538e-06, "loss": 0.71036941, "num_input_tokens_seen": 104821105, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.59765625, "step": 4860, "time_per_iteration": 2.3619866371154785 }, { "auxiliary_loss_clip": 0.01081391, "auxiliary_loss_mlp": 0.01036311, "balance_loss_clip": 1.0205512, "balance_loss_mlp": 1.02690506, "epoch": 0.29225913121899894, "flos": 35003221032960.0, "grad_norm": 2.084397699268929, "language_loss": 0.75611734, "learning_rate": 3.2149484335450722e-06, "loss": 0.7772944, "num_input_tokens_seen": 104841440, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 4861, "time_per_iteration": 2.5306544303894043 }, { "auxiliary_loss_clip": 0.01079733, "auxiliary_loss_mlp": 0.01039709, "balance_loss_clip": 1.02440238, "balance_loss_mlp": 1.02536917, "epoch": 0.2923192544716669, "flos": 13514843053440.0, "grad_norm": 1.6225731800692211, "language_loss": 0.90992594, "learning_rate": 3.2146482927082216e-06, "loss": 0.93112034, "num_input_tokens_seen": 104858210, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.54296875, "step": 4862, "time_per_iteration": 2.3501853942871094 }, { "auxiliary_loss_clip": 0.01081327, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.01696754, "balance_loss_mlp": 1.02506638, "epoch": 0.29237937772433487, "flos": 19462506894720.0, "grad_norm": 2.25288590517054, "language_loss": 0.73321408, "learning_rate": 3.214348108523698e-06, "loss": 0.75434983, "num_input_tokens_seen": 104875620, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5625, "step": 4863, "time_per_iteration": 2.378408908843994 }, { "auxiliary_loss_clip": 0.01077311, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.01527655, "balance_loss_mlp": 1.02555752, "epoch": 0.29243950097700283, "flos": 20849536321920.0, "grad_norm": 1.7460244344959828, "language_loss": 0.7778933, "learning_rate": 3.214047881002214e-06, "loss": 0.79897034, "num_input_tokens_seen": 104894600, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.515625, "step": 4864, "time_per_iteration": 2.360659599304199 }, { "auxiliary_loss_clip": 0.01083728, "auxiliary_loss_mlp": 0.01036639, "balance_loss_clip": 1.01857841, "balance_loss_mlp": 1.02650046, "epoch": 0.29249962422967085, "flos": 23583165955200.0, "grad_norm": 5.269798209451896, "language_loss": 0.8133713, "learning_rate": 3.2137476101544848e-06, "loss": 0.83457494, "num_input_tokens_seen": 104914530, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.57421875, "step": 4865, "time_per_iteration": 2.4074978828430176 }, { "auxiliary_loss_clip": 0.01082623, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 1.01410413, "balance_loss_mlp": 1.02608252, "epoch": 0.2925597474823388, "flos": 22272247025280.0, "grad_norm": 1.7860263103481007, "language_loss": 0.85093796, "learning_rate": 3.213447295991225e-06, "loss": 0.87208021, "num_input_tokens_seen": 104933460, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5625, "step": 4866, "time_per_iteration": 2.3721225261688232 }, { "auxiliary_loss_clip": 0.01077431, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.01423407, "balance_loss_mlp": 1.02409363, "epoch": 0.2926198707350068, "flos": 34454770485120.0, "grad_norm": 1.8350695441760498, "language_loss": 0.75842911, "learning_rate": 3.2131469385231525e-06, "loss": 0.77949834, "num_input_tokens_seen": 104954495, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53515625, "step": 4867, "time_per_iteration": 2.498326539993286 }, { "auxiliary_loss_clip": 0.01082298, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.02264643, "balance_loss_mlp": 1.02520621, "epoch": 0.29267999398767475, "flos": 20703110613120.0, "grad_norm": 1.8880790595271757, "language_loss": 0.73381352, "learning_rate": 3.212846537760986e-06, "loss": 0.75502706, "num_input_tokens_seen": 104971915, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 4868, "time_per_iteration": 2.372096300125122 }, { "auxiliary_loss_clip": 0.01080101, "auxiliary_loss_mlp": 0.01028783, "balance_loss_clip": 1.01271343, "balance_loss_mlp": 1.02560258, "epoch": 0.2927401172403427, "flos": 18367700480640.0, "grad_norm": 1.4892387846479225, "language_loss": 0.74384058, "learning_rate": 3.212546093715447e-06, "loss": 0.76492941, "num_input_tokens_seen": 104991335, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.546875, "step": 4869, "time_per_iteration": 2.385347366333008 }, { "auxiliary_loss_clip": 0.01081931, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.01398015, "balance_loss_mlp": 1.02586901, "epoch": 0.2928002404930107, "flos": 26102847576960.0, "grad_norm": 1.514749176376369, "language_loss": 0.76660168, "learning_rate": 3.2122456063972567e-06, "loss": 0.78772056, "num_input_tokens_seen": 105012015, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5625, "step": 4870, "time_per_iteration": 2.4170315265655518 }, { "auxiliary_loss_clip": 0.01085061, "auxiliary_loss_mlp": 0.0103873, "balance_loss_clip": 1.02070534, "balance_loss_mlp": 1.02652979, "epoch": 0.29286036374567864, "flos": 21323656851840.0, "grad_norm": 1.9955823115410365, "language_loss": 0.67672682, "learning_rate": 3.2119450758171393e-06, "loss": 0.69796467, "num_input_tokens_seen": 105031460, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.5859375, "step": 4871, "time_per_iteration": 2.3917009830474854 }, { "auxiliary_loss_clip": 0.0107787, "auxiliary_loss_mlp": 0.01033046, "balance_loss_clip": 1.01746511, "balance_loss_mlp": 1.02306676, "epoch": 0.2929204869983466, "flos": 29568221729280.0, "grad_norm": 1.8462850604659118, "language_loss": 0.77092016, "learning_rate": 3.2116445019858196e-06, "loss": 0.79202926, "num_input_tokens_seen": 105052965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.546875, "step": 4872, "time_per_iteration": 2.4423136711120605 }, { "auxiliary_loss_clip": 0.01084541, "auxiliary_loss_mlp": 0.01037598, "balance_loss_clip": 1.01919222, "balance_loss_mlp": 1.02650368, "epoch": 0.2929806102510146, "flos": 19057374944640.0, "grad_norm": 1.8819621063513206, "language_loss": 0.72737408, "learning_rate": 3.211343884914024e-06, "loss": 0.74859548, "num_input_tokens_seen": 105071840, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.58203125, "step": 4873, "time_per_iteration": 3.804901599884033 }, { "auxiliary_loss_clip": 0.01080528, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.01541924, "balance_loss_mlp": 1.02258682, "epoch": 0.29304073350368254, "flos": 21943155749760.0, "grad_norm": 3.6052179327819207, "language_loss": 0.78284812, "learning_rate": 3.211043224612481e-06, "loss": 0.80398273, "num_input_tokens_seen": 105089445, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.578125, "step": 4874, "time_per_iteration": 2.3735947608947754 }, { "auxiliary_loss_clip": 0.01084608, "auxiliary_loss_mlp": 0.01032643, "balance_loss_clip": 1.01491666, "balance_loss_mlp": 1.02578545, "epoch": 0.2931008567563505, "flos": 15449904092160.0, "grad_norm": 23.99480383454982, "language_loss": 0.77402413, "learning_rate": 3.2107425210919204e-06, "loss": 0.79519665, "num_input_tokens_seen": 105106210, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58984375, "step": 4875, "time_per_iteration": 2.3539669513702393 }, { "auxiliary_loss_clip": 0.01084258, "auxiliary_loss_mlp": 0.01032358, "balance_loss_clip": 1.01494789, "balance_loss_mlp": 1.0282594, "epoch": 0.29316098000901847, "flos": 16982207153280.0, "grad_norm": 1.8286495504906035, "language_loss": 0.69097143, "learning_rate": 3.2104417743630742e-06, "loss": 0.71213758, "num_input_tokens_seen": 105124200, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 4876, "time_per_iteration": 3.7695279121398926 }, { "auxiliary_loss_clip": 0.01080079, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.01652789, "balance_loss_mlp": 1.02518678, "epoch": 0.29322110326168643, "flos": 16356912969600.0, "grad_norm": 2.231555279326235, "language_loss": 0.82430893, "learning_rate": 3.2101409844366743e-06, "loss": 0.84543246, "num_input_tokens_seen": 105140400, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.546875, "step": 4877, "time_per_iteration": 2.3721048831939697 }, { "auxiliary_loss_clip": 0.01084541, "auxiliary_loss_mlp": 0.01034515, "balance_loss_clip": 1.01792073, "balance_loss_mlp": 1.02639866, "epoch": 0.29328122651435445, "flos": 13990010924160.0, "grad_norm": 2.421226651180059, "language_loss": 0.68025893, "learning_rate": 3.209840151323456e-06, "loss": 0.70144951, "num_input_tokens_seen": 105157535, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.58203125, "step": 4878, "time_per_iteration": 2.3418586254119873 }, { "auxiliary_loss_clip": 0.01081009, "auxiliary_loss_mlp": 0.01037966, "balance_loss_clip": 1.02038836, "balance_loss_mlp": 1.02516055, "epoch": 0.2933413497670224, "flos": 25263430824960.0, "grad_norm": 2.170631108356211, "language_loss": 0.73805404, "learning_rate": 3.2095392750341543e-06, "loss": 0.75924385, "num_input_tokens_seen": 105175185, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.55859375, "step": 4879, "time_per_iteration": 2.4113054275512695 }, { "auxiliary_loss_clip": 0.01086212, "auxiliary_loss_mlp": 0.01037099, "balance_loss_clip": 1.0185858, "balance_loss_mlp": 1.02735305, "epoch": 0.2934014730196904, "flos": 32122397640960.0, "grad_norm": 1.835718001825746, "language_loss": 0.66645366, "learning_rate": 3.209238355579507e-06, "loss": 0.68768674, "num_input_tokens_seen": 105194540, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.5859375, "step": 4880, "time_per_iteration": 3.8453097343444824 }, { "auxiliary_loss_clip": 0.01081036, "auxiliary_loss_mlp": 0.01039601, "balance_loss_clip": 1.023067, "balance_loss_mlp": 1.02461982, "epoch": 0.29346159627235835, "flos": 24351359800320.0, "grad_norm": 2.006945749269124, "language_loss": 0.69953066, "learning_rate": 3.2089373929702542e-06, "loss": 0.7207371, "num_input_tokens_seen": 105213215, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.56640625, "step": 4881, "time_per_iteration": 2.3981196880340576 }, { "auxiliary_loss_clip": 0.01081542, "auxiliary_loss_mlp": 0.01039028, "balance_loss_clip": 1.02225518, "balance_loss_mlp": 1.02529192, "epoch": 0.2935217195250263, "flos": 22745669328000.0, "grad_norm": 1.553599614974391, "language_loss": 0.83585513, "learning_rate": 3.2086363872171344e-06, "loss": 0.85706079, "num_input_tokens_seen": 105231585, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 4882, "time_per_iteration": 2.4001901149749756 }, { "auxiliary_loss_clip": 0.0108388, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.0185287, "balance_loss_mlp": 1.0260725, "epoch": 0.2935818427776943, "flos": 21724494704640.0, "grad_norm": 2.67324288917706, "language_loss": 0.71457648, "learning_rate": 3.208335338330892e-06, "loss": 0.73577839, "num_input_tokens_seen": 105250120, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.578125, "step": 4883, "time_per_iteration": 3.7564988136291504 }, { "auxiliary_loss_clip": 0.01082902, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.01577342, "balance_loss_mlp": 1.02634728, "epoch": 0.29364196603036224, "flos": 23803851859200.0, "grad_norm": 3.0229490266153656, "language_loss": 0.92722136, "learning_rate": 3.2080342463222693e-06, "loss": 0.9483794, "num_input_tokens_seen": 105266065, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.56640625, "step": 4884, "time_per_iteration": 2.3886795043945312 }, { "auxiliary_loss_clip": 0.0108505, "auxiliary_loss_mlp": 0.01036539, "balance_loss_clip": 1.01994467, "balance_loss_mlp": 1.02779257, "epoch": 0.2937020892830302, "flos": 23469139854720.0, "grad_norm": 2.392878217485229, "language_loss": 0.73708129, "learning_rate": 3.207733111202011e-06, "loss": 0.75829715, "num_input_tokens_seen": 105282155, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.57421875, "step": 4885, "time_per_iteration": 2.3783321380615234 }, { "auxiliary_loss_clip": 0.01080522, "auxiliary_loss_mlp": 0.01030078, "balance_loss_clip": 1.0132817, "balance_loss_mlp": 1.02522457, "epoch": 0.2937622125356982, "flos": 24271793078400.0, "grad_norm": 1.8573990103585152, "language_loss": 0.85225159, "learning_rate": 3.2074319329808656e-06, "loss": 0.87335759, "num_input_tokens_seen": 105299225, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5546875, "step": 4886, "time_per_iteration": 2.3859682083129883 }, { "auxiliary_loss_clip": 0.01081257, "auxiliary_loss_mlp": 0.01034765, "balance_loss_clip": 1.01786137, "balance_loss_mlp": 1.02385736, "epoch": 0.29382233578836614, "flos": 20661564228480.0, "grad_norm": 2.32258663063069, "language_loss": 0.76938081, "learning_rate": 3.2071307116695803e-06, "loss": 0.79054105, "num_input_tokens_seen": 105315710, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.57421875, "step": 4887, "time_per_iteration": 2.353215456008911 }, { "auxiliary_loss_clip": 0.01084492, "auxiliary_loss_mlp": 0.01033258, "balance_loss_clip": 1.01702142, "balance_loss_mlp": 1.02638662, "epoch": 0.2938824590410341, "flos": 16544117013120.0, "grad_norm": 2.8613383860906425, "language_loss": 0.79698789, "learning_rate": 3.2068294472789044e-06, "loss": 0.81816536, "num_input_tokens_seen": 105333505, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.58203125, "step": 4888, "time_per_iteration": 2.3500609397888184 }, { "auxiliary_loss_clip": 0.01078894, "auxiliary_loss_mlp": 0.01032378, "balance_loss_clip": 1.01561737, "balance_loss_mlp": 1.02397156, "epoch": 0.29394258229370207, "flos": 37923949975680.0, "grad_norm": 1.3871355726088788, "language_loss": 0.55150604, "learning_rate": 3.20652813981959e-06, "loss": 0.57261878, "num_input_tokens_seen": 105355605, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.546875, "step": 4889, "time_per_iteration": 2.516327381134033 }, { "auxiliary_loss_clip": 0.0108457, "auxiliary_loss_mlp": 0.01036514, "balance_loss_clip": 1.01814413, "balance_loss_mlp": 1.02591062, "epoch": 0.29400270554637004, "flos": 20043741075840.0, "grad_norm": 1.572949638975355, "language_loss": 0.8448022, "learning_rate": 3.2062267893023903e-06, "loss": 0.86601299, "num_input_tokens_seen": 105374225, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.5859375, "step": 4890, "time_per_iteration": 2.388153314590454 }, { "auxiliary_loss_clip": 0.0108447, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.01847041, "balance_loss_mlp": 1.02654934, "epoch": 0.294062828799038, "flos": 15265527868800.0, "grad_norm": 1.7813438056417468, "language_loss": 0.72199506, "learning_rate": 3.205925395738059e-06, "loss": 0.74319386, "num_input_tokens_seen": 105391565, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.578125, "step": 4891, "time_per_iteration": 2.3356659412384033 }, { "auxiliary_loss_clip": 0.0108306, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.01757908, "balance_loss_mlp": 1.02644444, "epoch": 0.294122952051706, "flos": 22746053352960.0, "grad_norm": 1.7578053358519676, "language_loss": 0.77017832, "learning_rate": 3.205623959137353e-06, "loss": 0.79135942, "num_input_tokens_seen": 105409840, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.56640625, "step": 4892, "time_per_iteration": 2.372314691543579 }, { "auxiliary_loss_clip": 0.01080153, "auxiliary_loss_mlp": 0.01031026, "balance_loss_clip": 1.01487303, "balance_loss_mlp": 1.02538633, "epoch": 0.294183075304374, "flos": 24971731482240.0, "grad_norm": 1.6844281457659027, "language_loss": 0.78581607, "learning_rate": 3.205322479511028e-06, "loss": 0.8069278, "num_input_tokens_seen": 105428645, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.546875, "step": 4893, "time_per_iteration": 2.3993079662323 }, { "auxiliary_loss_clip": 0.01083619, "auxiliary_loss_mlp": 0.01038484, "balance_loss_clip": 1.02188969, "balance_loss_mlp": 1.02579999, "epoch": 0.29424319855704195, "flos": 30951760020480.0, "grad_norm": 2.0692127805239866, "language_loss": 0.84711272, "learning_rate": 3.205020956869845e-06, "loss": 0.86833376, "num_input_tokens_seen": 105447480, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.578125, "step": 4894, "time_per_iteration": 2.4666411876678467 }, { "auxiliary_loss_clip": 0.01081782, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.0100764, "balance_loss_mlp": 1.02415371, "epoch": 0.2943033218097099, "flos": 15230684465280.0, "grad_norm": 2.3165119224286936, "language_loss": 0.9101001, "learning_rate": 3.204719391224563e-06, "loss": 0.93118083, "num_input_tokens_seen": 105464600, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.578125, "step": 4895, "time_per_iteration": 2.3518359661102295 }, { "auxiliary_loss_clip": 0.01083902, "auxiliary_loss_mlp": 0.01039759, "balance_loss_clip": 1.02205598, "balance_loss_mlp": 1.02557957, "epoch": 0.2943634450623779, "flos": 21724808906880.0, "grad_norm": 2.2359087926147607, "language_loss": 0.86197579, "learning_rate": 3.2044177825859457e-06, "loss": 0.88321245, "num_input_tokens_seen": 105481510, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5859375, "step": 4896, "time_per_iteration": 2.3923354148864746 }, { "auxiliary_loss_clip": 0.01085484, "auxiliary_loss_mlp": 0.01041251, "balance_loss_clip": 1.02265477, "balance_loss_mlp": 1.02733314, "epoch": 0.29442356831504585, "flos": 22600989187200.0, "grad_norm": 1.688096675929124, "language_loss": 0.73318756, "learning_rate": 3.2041161309647555e-06, "loss": 0.75445491, "num_input_tokens_seen": 105501390, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.58203125, "step": 4897, "time_per_iteration": 2.3941261768341064 }, { "auxiliary_loss_clip": 0.01086413, "auxiliary_loss_mlp": 0.01036772, "balance_loss_clip": 1.01761472, "balance_loss_mlp": 1.02478504, "epoch": 0.2944836915677138, "flos": 20010363949440.0, "grad_norm": 1.9960406042886065, "language_loss": 0.73861003, "learning_rate": 3.2038144363717572e-06, "loss": 0.7598418, "num_input_tokens_seen": 105519600, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.6171875, "step": 4898, "time_per_iteration": 2.369438409805298 }, { "auxiliary_loss_clip": 0.01089625, "auxiliary_loss_mlp": 0.01042093, "balance_loss_clip": 1.0222919, "balance_loss_mlp": 1.02782011, "epoch": 0.2945438148203818, "flos": 20044893150720.0, "grad_norm": 3.701299878786074, "language_loss": 0.70102954, "learning_rate": 3.203512698817719e-06, "loss": 0.72234678, "num_input_tokens_seen": 105535970, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.6171875, "step": 4899, "time_per_iteration": 2.359187364578247 }, { "auxiliary_loss_clip": 0.01084016, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.02347994, "balance_loss_mlp": 1.02632058, "epoch": 0.29460393807304974, "flos": 23732384572800.0, "grad_norm": 1.9553479039142885, "language_loss": 0.78914893, "learning_rate": 3.2032109183134086e-06, "loss": 0.81040412, "num_input_tokens_seen": 105556735, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.578125, "step": 4900, "time_per_iteration": 2.406459093093872 }, { "auxiliary_loss_clip": 0.01081344, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.01802444, "balance_loss_mlp": 1.02383006, "epoch": 0.2946640613257177, "flos": 14975190069120.0, "grad_norm": 1.6438964898485167, "language_loss": 0.80501366, "learning_rate": 3.202909094869595e-06, "loss": 0.82618064, "num_input_tokens_seen": 105574875, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.57421875, "step": 4901, "time_per_iteration": 2.340542793273926 }, { "auxiliary_loss_clip": 0.01076804, "auxiliary_loss_mlp": 0.01029178, "balance_loss_clip": 1.01307261, "balance_loss_mlp": 1.02381194, "epoch": 0.2947241845783857, "flos": 24242744960640.0, "grad_norm": 2.422611514284392, "language_loss": 0.57843292, "learning_rate": 3.2026072284970504e-06, "loss": 0.59949273, "num_input_tokens_seen": 105594225, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53125, "step": 4902, "time_per_iteration": 2.4023821353912354 }, { "auxiliary_loss_clip": 0.01083378, "auxiliary_loss_mlp": 0.01032702, "balance_loss_clip": 1.0166508, "balance_loss_mlp": 1.02534986, "epoch": 0.29478430783105364, "flos": 19937360563200.0, "grad_norm": 1.7167352084112582, "language_loss": 0.75626671, "learning_rate": 3.202305319206547e-06, "loss": 0.77742743, "num_input_tokens_seen": 105614000, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.578125, "step": 4903, "time_per_iteration": 2.391801595687866 }, { "auxiliary_loss_clip": 0.01084565, "auxiliary_loss_mlp": 0.01040744, "balance_loss_clip": 1.02301753, "balance_loss_mlp": 1.02632689, "epoch": 0.2948444310837216, "flos": 27380110089600.0, "grad_norm": 2.8099633961081496, "language_loss": 0.61930472, "learning_rate": 3.20200336700886e-06, "loss": 0.64055777, "num_input_tokens_seen": 105634575, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.58203125, "step": 4904, "time_per_iteration": 2.4181010723114014 }, { "auxiliary_loss_clip": 0.01083676, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.01679766, "balance_loss_mlp": 1.02543235, "epoch": 0.2949045543363896, "flos": 23404305726720.0, "grad_norm": 1.8502228578174662, "language_loss": 0.73049009, "learning_rate": 3.2017013719147644e-06, "loss": 0.75166583, "num_input_tokens_seen": 105654385, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.58203125, "step": 4905, "time_per_iteration": 2.398667812347412 }, { "auxiliary_loss_clip": 0.01081329, "auxiliary_loss_mlp": 0.01033864, "balance_loss_clip": 1.01653099, "balance_loss_mlp": 1.02478731, "epoch": 0.2949646775890576, "flos": 23950347390720.0, "grad_norm": 1.7390085340468906, "language_loss": 0.81068105, "learning_rate": 3.201399333935038e-06, "loss": 0.831833, "num_input_tokens_seen": 105673570, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.56640625, "step": 4906, "time_per_iteration": 2.3977715969085693 }, { "auxiliary_loss_clip": 0.01078128, "auxiliary_loss_mlp": 0.01028579, "balance_loss_clip": 1.01258135, "balance_loss_mlp": 1.02443981, "epoch": 0.29502480084172555, "flos": 22783200906240.0, "grad_norm": 2.603466052185471, "language_loss": 0.87530965, "learning_rate": 3.2010972530804595e-06, "loss": 0.89637673, "num_input_tokens_seen": 105691940, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 4907, "time_per_iteration": 2.409848690032959 }, { "auxiliary_loss_clip": 0.0108497, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.01394343, "balance_loss_mlp": 1.02614701, "epoch": 0.2950849240943935, "flos": 19645626309120.0, "grad_norm": 2.102808847733714, "language_loss": 0.82179803, "learning_rate": 3.20079512936181e-06, "loss": 0.84297311, "num_input_tokens_seen": 105709825, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.5859375, "step": 4908, "time_per_iteration": 2.368774890899658 }, { "auxiliary_loss_clip": 0.0101795, "auxiliary_loss_mlp": 0.01013677, "balance_loss_clip": 1.01192486, "balance_loss_mlp": 1.00435793, "epoch": 0.2951450473470615, "flos": 70999790469120.0, "grad_norm": 0.7751992544529333, "language_loss": 0.57288802, "learning_rate": 3.2004929627898707e-06, "loss": 0.59320438, "num_input_tokens_seen": 105766880, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.13574219, "step": 4909, "time_per_iteration": 2.909693479537964 }, { "auxiliary_loss_clip": 0.01083434, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.01845741, "balance_loss_mlp": 1.0268054, "epoch": 0.29520517059972945, "flos": 22965203157120.0, "grad_norm": 1.615802224826704, "language_loss": 0.86681747, "learning_rate": 3.200190753375426e-06, "loss": 0.88799345, "num_input_tokens_seen": 105786875, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.56640625, "step": 4910, "time_per_iteration": 2.388585090637207 }, { "auxiliary_loss_clip": 0.01077096, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.01917529, "balance_loss_mlp": 1.02380323, "epoch": 0.2952652938523974, "flos": 20484624124800.0, "grad_norm": 1.951268358274521, "language_loss": 0.72797281, "learning_rate": 3.1998885011292604e-06, "loss": 0.74908936, "num_input_tokens_seen": 105805315, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53515625, "step": 4911, "time_per_iteration": 2.3944482803344727 }, { "auxiliary_loss_clip": 0.01082167, "auxiliary_loss_mlp": 0.01028729, "balance_loss_clip": 1.01252866, "balance_loss_mlp": 1.02685213, "epoch": 0.2953254171050654, "flos": 19645556486400.0, "grad_norm": 1.6875099426125348, "language_loss": 0.90211958, "learning_rate": 3.199586206062161e-06, "loss": 0.9232285, "num_input_tokens_seen": 105825125, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5546875, "step": 4912, "time_per_iteration": 2.3709516525268555 }, { "auxiliary_loss_clip": 0.01082796, "auxiliary_loss_mlp": 0.01041035, "balance_loss_clip": 1.02349925, "balance_loss_mlp": 1.02643323, "epoch": 0.29538554035773334, "flos": 22746856314240.0, "grad_norm": 1.3142472597057169, "language_loss": 0.83268452, "learning_rate": 3.1992838681849153e-06, "loss": 0.85392284, "num_input_tokens_seen": 105846085, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5625, "step": 4913, "time_per_iteration": 3.7645251750946045 }, { "auxiliary_loss_clip": 0.01083181, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.01692128, "balance_loss_mlp": 1.02522612, "epoch": 0.2954456636104013, "flos": 21870780768000.0, "grad_norm": 1.6816325869538873, "language_loss": 0.76519728, "learning_rate": 3.1989814875083134e-06, "loss": 0.7863636, "num_input_tokens_seen": 105865400, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.578125, "step": 4914, "time_per_iteration": 2.385406017303467 }, { "auxiliary_loss_clip": 0.01080662, "auxiliary_loss_mlp": 0.01032381, "balance_loss_clip": 1.01528645, "balance_loss_mlp": 1.02527452, "epoch": 0.2955057868630693, "flos": 40440978334080.0, "grad_norm": 1.7104443636612858, "language_loss": 0.8157649, "learning_rate": 3.198679064043146e-06, "loss": 0.83689535, "num_input_tokens_seen": 105887920, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5546875, "step": 4915, "time_per_iteration": 2.5420663356781006 }, { "auxiliary_loss_clip": 0.01083894, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.01720047, "balance_loss_mlp": 1.02655172, "epoch": 0.29556591011573724, "flos": 22563422697600.0, "grad_norm": 1.9577696244712204, "language_loss": 0.84652781, "learning_rate": 3.1983765978002067e-06, "loss": 0.86770213, "num_input_tokens_seen": 105904035, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.57421875, "step": 4916, "time_per_iteration": 3.798715114593506 }, { "auxiliary_loss_clip": 0.01076681, "auxiliary_loss_mlp": 0.01027232, "balance_loss_clip": 1.01163971, "balance_loss_mlp": 1.023875, "epoch": 0.2956260333684052, "flos": 22088254826880.0, "grad_norm": 2.022261172999719, "language_loss": 0.70051736, "learning_rate": 3.198074088790289e-06, "loss": 0.72155643, "num_input_tokens_seen": 105922685, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 4917, "time_per_iteration": 2.3779242038726807 }, { "auxiliary_loss_clip": 0.01083227, "auxiliary_loss_mlp": 0.01031051, "balance_loss_clip": 1.01427794, "balance_loss_mlp": 1.02686203, "epoch": 0.2956861566210732, "flos": 16434559566720.0, "grad_norm": 2.12124537751327, "language_loss": 0.90761769, "learning_rate": 3.197771537024189e-06, "loss": 0.92876041, "num_input_tokens_seen": 105940425, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 4918, "time_per_iteration": 2.363657236099243 }, { "auxiliary_loss_clip": 0.01082568, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.0171864, "balance_loss_mlp": 1.02558792, "epoch": 0.2957462798737412, "flos": 25810903854720.0, "grad_norm": 1.9382986504832316, "language_loss": 0.72297627, "learning_rate": 3.197468942512703e-06, "loss": 0.74414045, "num_input_tokens_seen": 105960550, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5703125, "step": 4919, "time_per_iteration": 2.4097585678100586 }, { "auxiliary_loss_clip": 0.01079496, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.0193845, "balance_loss_mlp": 1.02410746, "epoch": 0.29580640312640916, "flos": 16689914317440.0, "grad_norm": 2.188192651828881, "language_loss": 0.75942761, "learning_rate": 3.1971663052666317e-06, "loss": 0.78057921, "num_input_tokens_seen": 105978820, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5546875, "step": 4920, "time_per_iteration": 3.7057127952575684 }, { "auxiliary_loss_clip": 0.01083026, "auxiliary_loss_mlp": 0.01038267, "balance_loss_clip": 1.02089834, "balance_loss_mlp": 1.02663875, "epoch": 0.2958665263790771, "flos": 23944621927680.0, "grad_norm": 2.154003675044161, "language_loss": 0.68290514, "learning_rate": 3.196863625296775e-06, "loss": 0.70411807, "num_input_tokens_seen": 105997545, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 4921, "time_per_iteration": 2.4013686180114746 }, { "auxiliary_loss_clip": 0.01084788, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.0195272, "balance_loss_mlp": 1.02576828, "epoch": 0.2959266496317451, "flos": 18477432483840.0, "grad_norm": 2.072372345309396, "language_loss": 0.74815679, "learning_rate": 3.1965609026139327e-06, "loss": 0.76937425, "num_input_tokens_seen": 106015320, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.58984375, "step": 4922, "time_per_iteration": 2.362406015396118 }, { "auxiliary_loss_clip": 0.01084972, "auxiliary_loss_mlp": 0.01034915, "balance_loss_clip": 1.01662803, "balance_loss_mlp": 1.02432787, "epoch": 0.29598677288441305, "flos": 25956317134080.0, "grad_norm": 2.026167419747763, "language_loss": 0.76809484, "learning_rate": 3.1962581372289105e-06, "loss": 0.78929377, "num_input_tokens_seen": 106034555, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.60546875, "step": 4923, "time_per_iteration": 3.867509365081787 }, { "auxiliary_loss_clip": 0.01083088, "auxiliary_loss_mlp": 0.01036361, "balance_loss_clip": 1.01955867, "balance_loss_mlp": 1.02602458, "epoch": 0.296046896137081, "flos": 25154815985280.0, "grad_norm": 2.5533045242335186, "language_loss": 0.8641305, "learning_rate": 3.195955329152512e-06, "loss": 0.88532495, "num_input_tokens_seen": 106054200, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 4924, "time_per_iteration": 2.3980345726013184 }, { "auxiliary_loss_clip": 0.01081054, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.01644504, "balance_loss_mlp": 1.02528882, "epoch": 0.296107019389749, "flos": 21760106158080.0, "grad_norm": 1.65516296503997, "language_loss": 0.81541371, "learning_rate": 3.1956524783955453e-06, "loss": 0.8365618, "num_input_tokens_seen": 106074700, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.55859375, "step": 4925, "time_per_iteration": 2.4050099849700928 }, { "auxiliary_loss_clip": 0.01079202, "auxiliary_loss_mlp": 0.01033993, "balance_loss_clip": 1.01807833, "balance_loss_mlp": 1.02453804, "epoch": 0.29616714264241695, "flos": 17959286862720.0, "grad_norm": 2.4937253093920844, "language_loss": 0.85965389, "learning_rate": 3.195349584968816e-06, "loss": 0.88078582, "num_input_tokens_seen": 106091415, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.546875, "step": 4926, "time_per_iteration": 2.3361332416534424 }, { "auxiliary_loss_clip": 0.01080665, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 1.01423955, "balance_loss_mlp": 1.02393651, "epoch": 0.2962272658950849, "flos": 15011883774720.0, "grad_norm": 1.7826865633422024, "language_loss": 0.85789901, "learning_rate": 3.1950466488831357e-06, "loss": 0.87901199, "num_input_tokens_seen": 106109135, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.56640625, "step": 4927, "time_per_iteration": 2.355961322784424 }, { "auxiliary_loss_clip": 0.01079876, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.01846313, "balance_loss_mlp": 1.02535462, "epoch": 0.2962873891477529, "flos": 14719974963840.0, "grad_norm": 1.7001593708258933, "language_loss": 0.80577832, "learning_rate": 3.194743670149314e-06, "loss": 0.82692057, "num_input_tokens_seen": 106125750, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.546875, "step": 4928, "time_per_iteration": 2.3352510929107666 }, { "auxiliary_loss_clip": 0.01088138, "auxiliary_loss_mlp": 0.01038856, "balance_loss_clip": 1.0184716, "balance_loss_mlp": 1.02639675, "epoch": 0.29634751240042084, "flos": 26722590854400.0, "grad_norm": 2.3744074087895477, "language_loss": 0.72309142, "learning_rate": 3.194440648778164e-06, "loss": 0.7443614, "num_input_tokens_seen": 106142835, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 0.6171875, "step": 4929, "time_per_iteration": 2.4388341903686523 }, { "auxiliary_loss_clip": 0.0108588, "auxiliary_loss_mlp": 0.01036753, "balance_loss_clip": 1.01866865, "balance_loss_mlp": 1.02645302, "epoch": 0.2964076356530888, "flos": 14570511966720.0, "grad_norm": 4.212360454262512, "language_loss": 0.71932477, "learning_rate": 3.1941375847805e-06, "loss": 0.74055111, "num_input_tokens_seen": 106160680, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.59375, "step": 4930, "time_per_iteration": 2.3521056175231934 }, { "auxiliary_loss_clip": 0.01016475, "auxiliary_loss_mlp": 0.01000342, "balance_loss_clip": 0.99885148, "balance_loss_mlp": 1.00428259, "epoch": 0.29646775890575683, "flos": 63100969585920.0, "grad_norm": 1.3993840343409092, "language_loss": 0.60672355, "learning_rate": 3.193834478167137e-06, "loss": 0.62689161, "num_input_tokens_seen": 106224415, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.12207031, "step": 4931, "time_per_iteration": 2.992666482925415 }, { "auxiliary_loss_clip": 0.0101594, "auxiliary_loss_mlp": 0.0100598, "balance_loss_clip": 1.00443029, "balance_loss_mlp": 1.00372422, "epoch": 0.2965278821584248, "flos": 63064345703040.0, "grad_norm": 0.735726169862356, "language_loss": 0.52304494, "learning_rate": 3.1935313289488926e-06, "loss": 0.54326415, "num_input_tokens_seen": 106279140, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.12207031, "step": 4932, "time_per_iteration": 2.917015552520752 }, { "auxiliary_loss_clip": 0.01081801, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.01920736, "balance_loss_mlp": 1.02534056, "epoch": 0.29658800541109276, "flos": 23767612001280.0, "grad_norm": 1.6280007898936617, "language_loss": 0.81764573, "learning_rate": 3.193228137136585e-06, "loss": 0.83882314, "num_input_tokens_seen": 106298190, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 4933, "time_per_iteration": 2.432817220687866 }, { "auxiliary_loss_clip": 0.01080823, "auxiliary_loss_mlp": 0.0103003, "balance_loss_clip": 1.01380575, "balance_loss_mlp": 1.02674389, "epoch": 0.2966481286637607, "flos": 23987390209920.0, "grad_norm": 1.66283188878693, "language_loss": 0.75357807, "learning_rate": 3.1929249027410347e-06, "loss": 0.77468657, "num_input_tokens_seen": 106319065, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5390625, "step": 4934, "time_per_iteration": 2.425346851348877 }, { "auxiliary_loss_clip": 0.0108314, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.02037764, "balance_loss_mlp": 1.02552247, "epoch": 0.2967082519164287, "flos": 17164209404160.0, "grad_norm": 1.893474981864153, "language_loss": 0.62201482, "learning_rate": 3.1926216257730634e-06, "loss": 0.64322567, "num_input_tokens_seen": 106338040, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.578125, "step": 4935, "time_per_iteration": 2.40461802482605 }, { "auxiliary_loss_clip": 0.01081253, "auxiliary_loss_mlp": 0.0103651, "balance_loss_clip": 1.01964164, "balance_loss_mlp": 1.02624869, "epoch": 0.29676837516909665, "flos": 29386428946560.0, "grad_norm": 1.4360867525035652, "language_loss": 0.79570103, "learning_rate": 3.1923183062434936e-06, "loss": 0.81687868, "num_input_tokens_seen": 106358900, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.55078125, "step": 4936, "time_per_iteration": 2.450254440307617 }, { "auxiliary_loss_clip": 0.01085813, "auxiliary_loss_mlp": 0.0103925, "balance_loss_clip": 1.0218811, "balance_loss_mlp": 1.02724814, "epoch": 0.2968284984217646, "flos": 34749786407040.0, "grad_norm": 1.6828314150332218, "language_loss": 0.74293697, "learning_rate": 3.1920149441631505e-06, "loss": 0.76418757, "num_input_tokens_seen": 106381805, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5859375, "step": 4937, "time_per_iteration": 2.522477388381958 }, { "auxiliary_loss_clip": 0.01081524, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.01664257, "balance_loss_mlp": 1.02645683, "epoch": 0.2968886216744326, "flos": 21543016124160.0, "grad_norm": 1.5303565576356226, "language_loss": 0.78013259, "learning_rate": 3.1917115395428608e-06, "loss": 0.80128628, "num_input_tokens_seen": 106402365, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.55078125, "step": 4938, "time_per_iteration": 2.3948261737823486 }, { "auxiliary_loss_clip": 0.01087227, "auxiliary_loss_mlp": 0.01040686, "balance_loss_clip": 1.02297115, "balance_loss_mlp": 1.028741, "epoch": 0.29694874492710055, "flos": 12786484936320.0, "grad_norm": 2.4509817098247697, "language_loss": 0.76497996, "learning_rate": 3.191408092393451e-06, "loss": 0.78625906, "num_input_tokens_seen": 106419800, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.58203125, "step": 4939, "time_per_iteration": 2.3792197704315186 }, { "auxiliary_loss_clip": 0.01087285, "auxiliary_loss_mlp": 0.0103921, "balance_loss_clip": 1.02199006, "balance_loss_mlp": 1.02810097, "epoch": 0.2970088681797685, "flos": 24568868770560.0, "grad_norm": 1.482221350598616, "language_loss": 0.77775824, "learning_rate": 3.1911046027257516e-06, "loss": 0.79902315, "num_input_tokens_seen": 106440300, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.59375, "step": 4940, "time_per_iteration": 2.4061083793640137 }, { "auxiliary_loss_clip": 0.01084323, "auxiliary_loss_mlp": 0.01033726, "balance_loss_clip": 1.01455736, "balance_loss_mlp": 1.02702928, "epoch": 0.2970689914324365, "flos": 23658054554880.0, "grad_norm": 1.5153718585456948, "language_loss": 0.75121921, "learning_rate": 3.1908010705505925e-06, "loss": 0.77239972, "num_input_tokens_seen": 106460035, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.57421875, "step": 4941, "time_per_iteration": 2.4436583518981934 }, { "auxiliary_loss_clip": 0.01086834, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.02140713, "balance_loss_mlp": 1.02710521, "epoch": 0.29712911468510445, "flos": 39668909328000.0, "grad_norm": 24.137535795859353, "language_loss": 0.74060488, "learning_rate": 3.1904974958788065e-06, "loss": 0.76187223, "num_input_tokens_seen": 106481095, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.59765625, "step": 4942, "time_per_iteration": 2.5331871509552 }, { "auxiliary_loss_clip": 0.01086437, "auxiliary_loss_mlp": 0.01039067, "balance_loss_clip": 1.02030301, "balance_loss_mlp": 1.02802205, "epoch": 0.2971892379377724, "flos": 26394127983360.0, "grad_norm": 2.4533612845299944, "language_loss": 0.70337939, "learning_rate": 3.190193878721227e-06, "loss": 0.72463441, "num_input_tokens_seen": 106501590, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.5859375, "step": 4943, "time_per_iteration": 2.4478604793548584 }, { "auxiliary_loss_clip": 0.01087369, "auxiliary_loss_mlp": 0.01032296, "balance_loss_clip": 1.01417589, "balance_loss_mlp": 1.02815938, "epoch": 0.2972493611904404, "flos": 17602229721600.0, "grad_norm": 2.1677638237566024, "language_loss": 0.79639876, "learning_rate": 3.1898902190886898e-06, "loss": 0.81759542, "num_input_tokens_seen": 106519430, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.59375, "step": 4944, "time_per_iteration": 2.354905128479004 }, { "auxiliary_loss_clip": 0.01079669, "auxiliary_loss_mlp": 0.0103301, "balance_loss_clip": 1.01638031, "balance_loss_mlp": 1.02533865, "epoch": 0.2973094844431084, "flos": 20411725472640.0, "grad_norm": 1.9256585896864484, "language_loss": 0.82982606, "learning_rate": 3.1895865169920316e-06, "loss": 0.85095286, "num_input_tokens_seen": 106535870, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.54296875, "step": 4945, "time_per_iteration": 2.3946311473846436 }, { "auxiliary_loss_clip": 0.0107914, "auxiliary_loss_mlp": 0.01035796, "balance_loss_clip": 1.01939237, "balance_loss_mlp": 1.02460909, "epoch": 0.29736960769577636, "flos": 17492532629760.0, "grad_norm": 1.7483272929882534, "language_loss": 0.66412324, "learning_rate": 3.18928277244209e-06, "loss": 0.68527257, "num_input_tokens_seen": 106553560, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 4946, "time_per_iteration": 2.355820655822754 }, { "auxiliary_loss_clip": 0.01083842, "auxiliary_loss_mlp": 0.0103211, "balance_loss_clip": 1.01539731, "balance_loss_mlp": 1.02757418, "epoch": 0.2974297309484443, "flos": 26102777754240.0, "grad_norm": 1.6484372382581192, "language_loss": 0.73916656, "learning_rate": 3.1889789854497052e-06, "loss": 0.76032609, "num_input_tokens_seen": 106574115, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5625, "step": 4947, "time_per_iteration": 2.455073118209839 }, { "auxiliary_loss_clip": 0.01085889, "auxiliary_loss_mlp": 0.01036002, "balance_loss_clip": 1.0185616, "balance_loss_mlp": 1.02581787, "epoch": 0.2974898542011123, "flos": 25665246195840.0, "grad_norm": 2.4214897349304167, "language_loss": 0.7344296, "learning_rate": 3.188675156025719e-06, "loss": 0.75564855, "num_input_tokens_seen": 106593070, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.6015625, "step": 4948, "time_per_iteration": 2.4111666679382324 }, { "auxiliary_loss_clip": 0.01080541, "auxiliary_loss_mlp": 0.01030711, "balance_loss_clip": 1.01465368, "balance_loss_mlp": 1.02533805, "epoch": 0.29754997745378026, "flos": 18660342430080.0, "grad_norm": 2.0166661377262405, "language_loss": 0.83489668, "learning_rate": 3.1883712841809752e-06, "loss": 0.85600924, "num_input_tokens_seen": 106610695, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55078125, "step": 4949, "time_per_iteration": 2.40474009513855 }, { "auxiliary_loss_clip": 0.01079151, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.01442552, "balance_loss_mlp": 1.02464867, "epoch": 0.2976101007064482, "flos": 22273468922880.0, "grad_norm": 2.039484776956203, "language_loss": 0.71246374, "learning_rate": 3.188067369926316e-06, "loss": 0.73357439, "num_input_tokens_seen": 106631300, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.54296875, "step": 4950, "time_per_iteration": 2.399895429611206 }, { "auxiliary_loss_clip": 0.01079742, "auxiliary_loss_mlp": 0.01034907, "balance_loss_clip": 1.01941013, "balance_loss_mlp": 1.02599514, "epoch": 0.2976702239591162, "flos": 21944552204160.0, "grad_norm": 1.875904429409681, "language_loss": 0.82162273, "learning_rate": 3.1877634132725887e-06, "loss": 0.84276927, "num_input_tokens_seen": 106650065, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53515625, "step": 4951, "time_per_iteration": 2.405880928039551 }, { "auxiliary_loss_clip": 0.01079854, "auxiliary_loss_mlp": 0.01030158, "balance_loss_clip": 1.01366544, "balance_loss_mlp": 1.02419114, "epoch": 0.29773034721178415, "flos": 24636251427840.0, "grad_norm": 2.4846012778229496, "language_loss": 0.74077445, "learning_rate": 3.187459414230641e-06, "loss": 0.76187456, "num_input_tokens_seen": 106668230, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5546875, "step": 4952, "time_per_iteration": 2.4007833003997803 }, { "auxiliary_loss_clip": 0.01082459, "auxiliary_loss_mlp": 0.01036772, "balance_loss_clip": 1.01856911, "balance_loss_mlp": 1.02654946, "epoch": 0.2977904704644521, "flos": 20556545258880.0, "grad_norm": 1.8650635284884682, "language_loss": 0.84202546, "learning_rate": 3.187155372811321e-06, "loss": 0.86321777, "num_input_tokens_seen": 106687785, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.55859375, "step": 4953, "time_per_iteration": 3.8723080158233643 }, { "auxiliary_loss_clip": 0.01079685, "auxiliary_loss_mlp": 0.01035035, "balance_loss_clip": 1.01896596, "balance_loss_mlp": 1.0242126, "epoch": 0.2978505937171201, "flos": 18915452801280.0, "grad_norm": 1.9214609579710038, "language_loss": 0.73884964, "learning_rate": 3.186851289025479e-06, "loss": 0.75999683, "num_input_tokens_seen": 106706875, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5546875, "step": 4954, "time_per_iteration": 2.381962776184082 }, { "auxiliary_loss_clip": 0.01079855, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.0141294, "balance_loss_mlp": 1.02533495, "epoch": 0.29791071696978805, "flos": 19316744501760.0, "grad_norm": 2.031766591644584, "language_loss": 0.75790274, "learning_rate": 3.186547162883968e-06, "loss": 0.77900517, "num_input_tokens_seen": 106725105, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.546875, "step": 4955, "time_per_iteration": 2.3933565616607666 }, { "auxiliary_loss_clip": 0.0108229, "auxiliary_loss_mlp": 0.01033677, "balance_loss_clip": 1.01619482, "balance_loss_mlp": 1.02537096, "epoch": 0.297970840222456, "flos": 18805825532160.0, "grad_norm": 1.6318311261163168, "language_loss": 0.72451949, "learning_rate": 3.1862429943976404e-06, "loss": 0.74567914, "num_input_tokens_seen": 106744780, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5703125, "step": 4956, "time_per_iteration": 3.7838122844696045 }, { "auxiliary_loss_clip": 0.01083506, "auxiliary_loss_mlp": 0.01043598, "balance_loss_clip": 1.02627647, "balance_loss_mlp": 1.02395868, "epoch": 0.298030963475124, "flos": 22851770549760.0, "grad_norm": 4.2951227468985165, "language_loss": 0.7899521, "learning_rate": 3.1859387835773525e-06, "loss": 0.81122315, "num_input_tokens_seen": 106764670, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.59765625, "step": 4957, "time_per_iteration": 2.4262795448303223 }, { "auxiliary_loss_clip": 0.01080574, "auxiliary_loss_mlp": 0.01036749, "balance_loss_clip": 1.01974988, "balance_loss_mlp": 1.02406991, "epoch": 0.298091086727792, "flos": 21867499100160.0, "grad_norm": 1.5023480559888165, "language_loss": 0.70402986, "learning_rate": 3.1856345304339593e-06, "loss": 0.72520304, "num_input_tokens_seen": 106783695, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5625, "step": 4958, "time_per_iteration": 2.400470733642578 }, { "auxiliary_loss_clip": 0.01082169, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.01923442, "balance_loss_mlp": 1.02732944, "epoch": 0.29815120998045996, "flos": 21174054209280.0, "grad_norm": 1.6193798431206679, "language_loss": 0.78985393, "learning_rate": 3.1853302349783197e-06, "loss": 0.81102812, "num_input_tokens_seen": 106803150, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55078125, "step": 4959, "time_per_iteration": 3.7395496368408203 }, { "auxiliary_loss_clip": 0.01077656, "auxiliary_loss_mlp": 0.0103347, "balance_loss_clip": 1.01747179, "balance_loss_mlp": 1.02345788, "epoch": 0.29821133323312793, "flos": 19895395242240.0, "grad_norm": 1.8592494653388847, "language_loss": 0.79433644, "learning_rate": 3.185025897221293e-06, "loss": 0.81544769, "num_input_tokens_seen": 106820705, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.54296875, "step": 4960, "time_per_iteration": 2.3856394290924072 }, { "auxiliary_loss_clip": 0.01081901, "auxiliary_loss_mlp": 0.01032479, "balance_loss_clip": 1.01484823, "balance_loss_mlp": 1.02502084, "epoch": 0.2982714564857959, "flos": 12749930876160.0, "grad_norm": 2.561922803602244, "language_loss": 0.74007982, "learning_rate": 3.1847215171737406e-06, "loss": 0.76122361, "num_input_tokens_seen": 106837335, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5703125, "step": 4961, "time_per_iteration": 2.34979510307312 }, { "auxiliary_loss_clip": 0.01079736, "auxiliary_loss_mlp": 0.01027939, "balance_loss_clip": 1.01259637, "balance_loss_mlp": 1.02538598, "epoch": 0.29833157973846386, "flos": 22270850570880.0, "grad_norm": 1.7412902746459413, "language_loss": 0.62228787, "learning_rate": 3.1844170948465246e-06, "loss": 0.64336461, "num_input_tokens_seen": 106856250, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.54296875, "step": 4962, "time_per_iteration": 3.761460542678833 }, { "auxiliary_loss_clip": 0.01080534, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.02429366, "balance_loss_mlp": 1.02536082, "epoch": 0.2983917029911318, "flos": 15372222583680.0, "grad_norm": 1.8940844573085047, "language_loss": 0.83450472, "learning_rate": 3.184112630250509e-06, "loss": 0.85573542, "num_input_tokens_seen": 106873370, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.55078125, "step": 4963, "time_per_iteration": 2.3452987670898438 }, { "auxiliary_loss_clip": 0.01081522, "auxiliary_loss_mlp": 0.01029655, "balance_loss_clip": 1.01291764, "balance_loss_mlp": 1.02615285, "epoch": 0.2984518262437998, "flos": 15376726149120.0, "grad_norm": 2.2477371080255995, "language_loss": 0.66339022, "learning_rate": 3.1838081233965595e-06, "loss": 0.68450201, "num_input_tokens_seen": 106890330, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5546875, "step": 4964, "time_per_iteration": 2.354438543319702 }, { "auxiliary_loss_clip": 0.01078215, "auxiliary_loss_mlp": 0.01027563, "balance_loss_clip": 1.01223254, "balance_loss_mlp": 1.0245465, "epoch": 0.29851194949646775, "flos": 18107632696320.0, "grad_norm": 1.7220333178811484, "language_loss": 0.71495241, "learning_rate": 3.1835035742955435e-06, "loss": 0.73601019, "num_input_tokens_seen": 106909190, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.53515625, "step": 4965, "time_per_iteration": 2.3654918670654297 }, { "auxiliary_loss_clip": 0.01084846, "auxiliary_loss_mlp": 0.01032337, "balance_loss_clip": 1.01536155, "balance_loss_mlp": 1.02781796, "epoch": 0.2985720727491357, "flos": 22017136654080.0, "grad_norm": 1.788455229308673, "language_loss": 0.66098297, "learning_rate": 3.1831989829583286e-06, "loss": 0.68215483, "num_input_tokens_seen": 106927825, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5703125, "step": 4966, "time_per_iteration": 2.3892927169799805 }, { "auxiliary_loss_clip": 0.01085005, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.01940417, "balance_loss_mlp": 1.0272212, "epoch": 0.2986321960018037, "flos": 13040547966720.0, "grad_norm": 2.4757151727883597, "language_loss": 0.74111062, "learning_rate": 3.182894349395787e-06, "loss": 0.76232123, "num_input_tokens_seen": 106943155, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.578125, "step": 4967, "time_per_iteration": 2.3399319648742676 }, { "auxiliary_loss_clip": 0.01079334, "auxiliary_loss_mlp": 0.01029739, "balance_loss_clip": 1.01384878, "balance_loss_mlp": 1.02384973, "epoch": 0.29869231925447165, "flos": 14464166365440.0, "grad_norm": 1.9589136430162541, "language_loss": 0.71396685, "learning_rate": 3.1825896736187876e-06, "loss": 0.73505759, "num_input_tokens_seen": 106960295, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5546875, "step": 4968, "time_per_iteration": 2.347165107727051 }, { "auxiliary_loss_clip": 0.01080485, "auxiliary_loss_mlp": 0.01030845, "balance_loss_clip": 1.01346445, "balance_loss_mlp": 1.02313113, "epoch": 0.2987524425071396, "flos": 31648870604160.0, "grad_norm": 1.7095628584243794, "language_loss": 0.76583636, "learning_rate": 3.182284955638205e-06, "loss": 0.78694969, "num_input_tokens_seen": 106982870, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.57421875, "step": 4969, "time_per_iteration": 2.4576480388641357 }, { "auxiliary_loss_clip": 0.01081117, "auxiliary_loss_mlp": 0.01029858, "balance_loss_clip": 1.01413393, "balance_loss_mlp": 1.02602363, "epoch": 0.2988125657598076, "flos": 21432376425600.0, "grad_norm": 1.7152626049217727, "language_loss": 0.6997999, "learning_rate": 3.181980195464913e-06, "loss": 0.72090966, "num_input_tokens_seen": 107002405, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.55078125, "step": 4970, "time_per_iteration": 2.4025802612304688 }, { "auxiliary_loss_clip": 0.01082983, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.01765001, "balance_loss_mlp": 1.02405429, "epoch": 0.2988726890124756, "flos": 18076001137920.0, "grad_norm": 2.1080221485636597, "language_loss": 0.85170591, "learning_rate": 3.1816753931097894e-06, "loss": 0.87289882, "num_input_tokens_seen": 107017310, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.58984375, "step": 4971, "time_per_iteration": 2.345571756362915 }, { "auxiliary_loss_clip": 0.01077978, "auxiliary_loss_mlp": 0.01035813, "balance_loss_clip": 1.0189805, "balance_loss_mlp": 1.02405381, "epoch": 0.29893281226514357, "flos": 21754764720000.0, "grad_norm": 2.1463429817387962, "language_loss": 0.79577583, "learning_rate": 3.1813705485837095e-06, "loss": 0.81691372, "num_input_tokens_seen": 107034645, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5390625, "step": 4972, "time_per_iteration": 2.3881871700286865 }, { "auxiliary_loss_clip": 0.01081858, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.01960325, "balance_loss_mlp": 1.02570879, "epoch": 0.29899293551781153, "flos": 16835781444480.0, "grad_norm": 1.9887407082693163, "language_loss": 0.85172081, "learning_rate": 3.1810656618975544e-06, "loss": 0.87290978, "num_input_tokens_seen": 107051125, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 4973, "time_per_iteration": 2.343738555908203 }, { "auxiliary_loss_clip": 0.01078965, "auxiliary_loss_mlp": 0.01038845, "balance_loss_clip": 1.02296674, "balance_loss_mlp": 1.02659976, "epoch": 0.2990530587704795, "flos": 11728407139200.0, "grad_norm": 1.5730562905210672, "language_loss": 0.77422863, "learning_rate": 3.180760733062204e-06, "loss": 0.7954067, "num_input_tokens_seen": 107068815, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5234375, "step": 4974, "time_per_iteration": 2.3843629360198975 }, { "auxiliary_loss_clip": 0.01082426, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.01911223, "balance_loss_mlp": 1.0247128, "epoch": 0.29911318202314746, "flos": 28038571931520.0, "grad_norm": 1.7122826057419647, "language_loss": 0.7228446, "learning_rate": 3.1804557620885396e-06, "loss": 0.74402583, "num_input_tokens_seen": 107090420, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.578125, "step": 4975, "time_per_iteration": 2.4561450481414795 }, { "auxiliary_loss_clip": 0.01081795, "auxiliary_loss_mlp": 0.01031848, "balance_loss_clip": 1.0148015, "balance_loss_mlp": 1.02685761, "epoch": 0.2991733052758154, "flos": 18732577766400.0, "grad_norm": 2.1044543390603736, "language_loss": 0.75725859, "learning_rate": 3.1801507489874453e-06, "loss": 0.778395, "num_input_tokens_seen": 107107255, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.55078125, "step": 4976, "time_per_iteration": 2.365079879760742 }, { "auxiliary_loss_clip": 0.01080687, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.01796412, "balance_loss_mlp": 1.02608037, "epoch": 0.2992334285284834, "flos": 15558274552320.0, "grad_norm": 2.142849305216903, "language_loss": 0.86118251, "learning_rate": 3.1798456937698073e-06, "loss": 0.88232917, "num_input_tokens_seen": 107123840, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 4977, "time_per_iteration": 2.33827805519104 }, { "auxiliary_loss_clip": 0.01082456, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.01756597, "balance_loss_mlp": 1.02657115, "epoch": 0.29929355178115136, "flos": 21796520572800.0, "grad_norm": 1.6709021316852026, "language_loss": 0.68248498, "learning_rate": 3.1795405964465114e-06, "loss": 0.70365196, "num_input_tokens_seen": 107143475, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.55859375, "step": 4978, "time_per_iteration": 2.3888306617736816 }, { "auxiliary_loss_clip": 0.01081182, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.01507092, "balance_loss_mlp": 1.02576137, "epoch": 0.2993536750338193, "flos": 21177475522560.0, "grad_norm": 2.7338189545992795, "language_loss": 0.75924754, "learning_rate": 3.1792354570284452e-06, "loss": 0.78038073, "num_input_tokens_seen": 107161725, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5546875, "step": 4979, "time_per_iteration": 2.379936695098877 }, { "auxiliary_loss_clip": 0.01080065, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.01705599, "balance_loss_mlp": 1.02382302, "epoch": 0.2994137982864873, "flos": 32120826629760.0, "grad_norm": 1.655006270921756, "language_loss": 0.68303317, "learning_rate": 3.1789302755264996e-06, "loss": 0.70417726, "num_input_tokens_seen": 107183935, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.5625, "step": 4980, "time_per_iteration": 2.459005832672119 }, { "auxiliary_loss_clip": 0.01079715, "auxiliary_loss_mlp": 0.01035819, "balance_loss_clip": 1.01988077, "balance_loss_mlp": 1.02612805, "epoch": 0.29947392153915525, "flos": 21104367402240.0, "grad_norm": 1.773695016630351, "language_loss": 0.73461616, "learning_rate": 3.178625051951564e-06, "loss": 0.75577152, "num_input_tokens_seen": 107204285, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.53515625, "step": 4981, "time_per_iteration": 2.4080123901367188 }, { "auxiliary_loss_clip": 0.01080974, "auxiliary_loss_mlp": 0.01032784, "balance_loss_clip": 1.01629722, "balance_loss_mlp": 1.02344227, "epoch": 0.2995340447918232, "flos": 21541584758400.0, "grad_norm": 1.5953135557033733, "language_loss": 0.86637998, "learning_rate": 3.1783197863145335e-06, "loss": 0.88751757, "num_input_tokens_seen": 107225265, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.57421875, "step": 4982, "time_per_iteration": 2.385634183883667 }, { "auxiliary_loss_clip": 0.01083326, "auxiliary_loss_mlp": 0.01036894, "balance_loss_clip": 1.01855969, "balance_loss_mlp": 1.02523935, "epoch": 0.2995941680444912, "flos": 16724268961920.0, "grad_norm": 2.064618365489174, "language_loss": 0.86887771, "learning_rate": 3.1780144786262997e-06, "loss": 0.89007992, "num_input_tokens_seen": 107241335, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.58203125, "step": 4983, "time_per_iteration": 2.349159002304077 }, { "auxiliary_loss_clip": 0.01082124, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.01704073, "balance_loss_mlp": 1.02556515, "epoch": 0.2996542912971592, "flos": 20922434974080.0, "grad_norm": 2.6680200032774133, "language_loss": 0.78614646, "learning_rate": 3.17770912889776e-06, "loss": 0.80730087, "num_input_tokens_seen": 107259375, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5625, "step": 4984, "time_per_iteration": 2.3706724643707275 }, { "auxiliary_loss_clip": 0.0108325, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.01866663, "balance_loss_mlp": 1.02616, "epoch": 0.29971441454982717, "flos": 25078775310720.0, "grad_norm": 1.5860478498349024, "language_loss": 0.78228557, "learning_rate": 3.17740373713981e-06, "loss": 0.80347353, "num_input_tokens_seen": 107279890, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5703125, "step": 4985, "time_per_iteration": 2.4217989444732666 }, { "auxiliary_loss_clip": 0.01084551, "auxiliary_loss_mlp": 0.01035965, "balance_loss_clip": 1.01721382, "balance_loss_mlp": 1.02559328, "epoch": 0.29977453780249513, "flos": 52553989543680.0, "grad_norm": 1.9692385439801579, "language_loss": 0.71667582, "learning_rate": 3.1770983033633504e-06, "loss": 0.73788095, "num_input_tokens_seen": 107303430, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.58984375, "step": 4986, "time_per_iteration": 2.653592348098755 }, { "auxiliary_loss_clip": 0.01084353, "auxiliary_loss_mlp": 0.01040687, "balance_loss_clip": 1.02284098, "balance_loss_mlp": 1.02591658, "epoch": 0.2998346610551631, "flos": 22236042078720.0, "grad_norm": 2.0502182867088186, "language_loss": 0.73531449, "learning_rate": 3.1767928275792796e-06, "loss": 0.75656486, "num_input_tokens_seen": 107323700, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.58203125, "step": 4987, "time_per_iteration": 2.391425609588623 }, { "auxiliary_loss_clip": 0.01079211, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.01742935, "balance_loss_mlp": 1.02483892, "epoch": 0.29989478430783106, "flos": 16872265681920.0, "grad_norm": 2.0424406972309375, "language_loss": 0.80119443, "learning_rate": 3.1764873097984997e-06, "loss": 0.82231629, "num_input_tokens_seen": 107341965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.54296875, "step": 4988, "time_per_iteration": 2.345507860183716 }, { "auxiliary_loss_clip": 0.01080255, "auxiliary_loss_mlp": 0.01037872, "balance_loss_clip": 1.020926, "balance_loss_mlp": 1.024441, "epoch": 0.29995490756049903, "flos": 23767751646720.0, "grad_norm": 2.341136429212972, "language_loss": 0.70591819, "learning_rate": 3.1761817500319143e-06, "loss": 0.72709942, "num_input_tokens_seen": 107362615, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.55859375, "step": 4989, "time_per_iteration": 2.406156301498413 }, { "auxiliary_loss_clip": 0.01085645, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.01725078, "balance_loss_mlp": 1.02865005, "epoch": 0.300015030813167, "flos": 14464445656320.0, "grad_norm": 2.1163616930816005, "language_loss": 0.85251993, "learning_rate": 3.175876148290428e-06, "loss": 0.87372243, "num_input_tokens_seen": 107378980, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5703125, "step": 4990, "time_per_iteration": 2.3781583309173584 }, { "auxiliary_loss_clip": 0.0108342, "auxiliary_loss_mlp": 0.01034322, "balance_loss_clip": 1.01616633, "balance_loss_mlp": 1.02503562, "epoch": 0.30007515406583496, "flos": 25190811463680.0, "grad_norm": 1.7912572462240683, "language_loss": 0.67249948, "learning_rate": 3.175570504584946e-06, "loss": 0.69367695, "num_input_tokens_seen": 107397640, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.58203125, "step": 4991, "time_per_iteration": 2.4635307788848877 }, { "auxiliary_loss_clip": 0.01083613, "auxiliary_loss_mlp": 0.01040097, "balance_loss_clip": 1.02029645, "balance_loss_mlp": 1.02456439, "epoch": 0.3001352773185029, "flos": 19390166824320.0, "grad_norm": 1.8452541622253724, "language_loss": 0.78739929, "learning_rate": 3.175264818926377e-06, "loss": 0.80863643, "num_input_tokens_seen": 107416020, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.58984375, "step": 4992, "time_per_iteration": 3.789320945739746 }, { "auxiliary_loss_clip": 0.01079713, "auxiliary_loss_mlp": 0.01036541, "balance_loss_clip": 1.01939869, "balance_loss_mlp": 1.02497733, "epoch": 0.3001954005711709, "flos": 21542771744640.0, "grad_norm": 1.7661513954177344, "language_loss": 0.82482982, "learning_rate": 3.17495909132563e-06, "loss": 0.84599233, "num_input_tokens_seen": 107436340, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.546875, "step": 4993, "time_per_iteration": 2.4323110580444336 }, { "auxiliary_loss_clip": 0.01085163, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.02097487, "balance_loss_mlp": 1.02445126, "epoch": 0.30025552382383885, "flos": 17383359208320.0, "grad_norm": 2.2549975862711196, "language_loss": 0.85495508, "learning_rate": 3.174653321793615e-06, "loss": 0.8762151, "num_input_tokens_seen": 107454585, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.60546875, "step": 4994, "time_per_iteration": 2.373474359512329 }, { "auxiliary_loss_clip": 0.01083051, "auxiliary_loss_mlp": 0.01033208, "balance_loss_clip": 1.01686478, "balance_loss_mlp": 1.02654529, "epoch": 0.3003156470765068, "flos": 29532051694080.0, "grad_norm": 2.008891863343585, "language_loss": 0.81047344, "learning_rate": 3.1743475103412446e-06, "loss": 0.83163607, "num_input_tokens_seen": 107477180, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.56640625, "step": 4995, "time_per_iteration": 2.477457046508789 }, { "auxiliary_loss_clip": 0.010792, "auxiliary_loss_mlp": 0.0103423, "balance_loss_clip": 1.01655102, "balance_loss_mlp": 1.02507055, "epoch": 0.3003757703291748, "flos": 43644923159040.0, "grad_norm": 1.6878801284506146, "language_loss": 0.67329788, "learning_rate": 3.174041656979432e-06, "loss": 0.69443214, "num_input_tokens_seen": 107500250, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5390625, "step": 4996, "time_per_iteration": 4.041480541229248 }, { "auxiliary_loss_clip": 0.01081071, "auxiliary_loss_mlp": 0.01036733, "balance_loss_clip": 1.01979363, "balance_loss_mlp": 1.02459562, "epoch": 0.30043589358184275, "flos": 22527287573760.0, "grad_norm": 2.1129496227938844, "language_loss": 0.75430369, "learning_rate": 3.1737357617190935e-06, "loss": 0.77548176, "num_input_tokens_seen": 107520070, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5625, "step": 4997, "time_per_iteration": 2.545781135559082 }, { "auxiliary_loss_clip": 0.01077258, "auxiliary_loss_mlp": 0.01026782, "balance_loss_clip": 1.01080751, "balance_loss_mlp": 1.02360058, "epoch": 0.30049601683451077, "flos": 20994844867200.0, "grad_norm": 1.6812323361446717, "language_loss": 0.77757078, "learning_rate": 3.1734298245711443e-06, "loss": 0.79861116, "num_input_tokens_seen": 107539285, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53515625, "step": 4998, "time_per_iteration": 4.003045082092285 }, { "auxiliary_loss_clip": 0.01077762, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.01793289, "balance_loss_mlp": 1.02437735, "epoch": 0.30055614008717874, "flos": 23914840671360.0, "grad_norm": 1.5437849077527765, "language_loss": 0.72840375, "learning_rate": 3.1731238455465033e-06, "loss": 0.74951768, "num_input_tokens_seen": 107560260, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.53515625, "step": 4999, "time_per_iteration": 2.5262911319732666 }, { "auxiliary_loss_clip": 0.01080357, "auxiliary_loss_mlp": 0.01035271, "balance_loss_clip": 1.01815248, "balance_loss_mlp": 1.0255444, "epoch": 0.3006162633398467, "flos": 19168852515840.0, "grad_norm": 1.5843387602958585, "language_loss": 0.75770509, "learning_rate": 3.1728178246560903e-06, "loss": 0.7788614, "num_input_tokens_seen": 107579260, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.546875, "step": 5000, "time_per_iteration": 2.4891839027404785 }, { "auxiliary_loss_clip": 0.01079471, "auxiliary_loss_mlp": 0.0103528, "balance_loss_clip": 1.01942515, "balance_loss_mlp": 1.02659953, "epoch": 0.30067638659251467, "flos": 14678498401920.0, "grad_norm": 2.726883484417335, "language_loss": 0.81758177, "learning_rate": 3.172511761910825e-06, "loss": 0.83872926, "num_input_tokens_seen": 107595245, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53125, "step": 5001, "time_per_iteration": 2.4570488929748535 }, { "auxiliary_loss_clip": 0.01081517, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.01971793, "balance_loss_mlp": 1.0257709, "epoch": 0.30073650984518263, "flos": 23366878882560.0, "grad_norm": 2.1044792838413735, "language_loss": 0.80636716, "learning_rate": 3.1722056573216315e-06, "loss": 0.82755268, "num_input_tokens_seen": 107613985, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5546875, "step": 5002, "time_per_iteration": 3.8591198921203613 }, { "auxiliary_loss_clip": 0.0108517, "auxiliary_loss_mlp": 0.01032671, "balance_loss_clip": 1.01599324, "balance_loss_mlp": 1.0265274, "epoch": 0.3007966330978506, "flos": 22965517359360.0, "grad_norm": 1.8714947113016105, "language_loss": 0.71259362, "learning_rate": 3.1718995108994336e-06, "loss": 0.73377204, "num_input_tokens_seen": 107631435, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5859375, "step": 5003, "time_per_iteration": 2.400160551071167 }, { "auxiliary_loss_clip": 0.01082306, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.0175848, "balance_loss_mlp": 1.02557886, "epoch": 0.30085675635051856, "flos": 27817222711680.0, "grad_norm": 1.7953384054079804, "language_loss": 0.70450509, "learning_rate": 3.1715933226551562e-06, "loss": 0.72567403, "num_input_tokens_seen": 107650530, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.56640625, "step": 5004, "time_per_iteration": 2.440431594848633 }, { "auxiliary_loss_clip": 0.01083846, "auxiliary_loss_mlp": 0.01036555, "balance_loss_clip": 1.01845884, "balance_loss_mlp": 1.02555871, "epoch": 0.3009168796031865, "flos": 10882147760640.0, "grad_norm": 2.6380312247766984, "language_loss": 0.81586128, "learning_rate": 3.171287092599727e-06, "loss": 0.83706534, "num_input_tokens_seen": 107662240, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.58203125, "step": 5005, "time_per_iteration": 2.3403513431549072 }, { "auxiliary_loss_clip": 0.01080729, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.01595902, "balance_loss_mlp": 1.02547896, "epoch": 0.3009770028558545, "flos": 23804270795520.0, "grad_norm": 2.448149402759662, "language_loss": 0.74657762, "learning_rate": 3.1709808207440745e-06, "loss": 0.76770413, "num_input_tokens_seen": 107680330, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.55078125, "step": 5006, "time_per_iteration": 2.3962249755859375 }, { "auxiliary_loss_clip": 0.01080192, "auxiliary_loss_mlp": 0.01032461, "balance_loss_clip": 1.01655793, "balance_loss_mlp": 1.02567017, "epoch": 0.30103712610852246, "flos": 26467026635520.0, "grad_norm": 1.7758321801712973, "language_loss": 0.71093178, "learning_rate": 3.170674507099128e-06, "loss": 0.73205829, "num_input_tokens_seen": 107700020, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.546875, "step": 5007, "time_per_iteration": 2.432056188583374 }, { "auxiliary_loss_clip": 0.01080575, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.01070428, "balance_loss_mlp": 1.02538824, "epoch": 0.3010972493611904, "flos": 22855366419840.0, "grad_norm": 2.3228593027480535, "language_loss": 0.76148784, "learning_rate": 3.17036815167582e-06, "loss": 0.78257668, "num_input_tokens_seen": 107718575, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.55078125, "step": 5008, "time_per_iteration": 2.384437084197998 }, { "auxiliary_loss_clip": 0.01080406, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.01582277, "balance_loss_mlp": 1.02531934, "epoch": 0.3011573726138584, "flos": 24052748008320.0, "grad_norm": 2.308683541342158, "language_loss": 0.84400833, "learning_rate": 3.170061754485084e-06, "loss": 0.86513948, "num_input_tokens_seen": 107738635, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.55078125, "step": 5009, "time_per_iteration": 2.4246981143951416 }, { "auxiliary_loss_clip": 0.01082541, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.0166595, "balance_loss_mlp": 1.02468181, "epoch": 0.30121749586652635, "flos": 20258841162240.0, "grad_norm": 1.9321459927068347, "language_loss": 0.83378142, "learning_rate": 3.1697553155378527e-06, "loss": 0.85496426, "num_input_tokens_seen": 107753415, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.578125, "step": 5010, "time_per_iteration": 2.3626821041107178 }, { "auxiliary_loss_clip": 0.01081279, "auxiliary_loss_mlp": 0.01025883, "balance_loss_clip": 1.00958765, "balance_loss_mlp": 1.02533436, "epoch": 0.3012776191191944, "flos": 26941845392640.0, "grad_norm": 2.27009907400155, "language_loss": 0.8497237, "learning_rate": 3.1694488348450636e-06, "loss": 0.87079531, "num_input_tokens_seen": 107773840, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.55859375, "step": 5011, "time_per_iteration": 2.488070011138916 }, { "auxiliary_loss_clip": 0.01081505, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.01608098, "balance_loss_mlp": 1.02349353, "epoch": 0.30133774237186234, "flos": 20411271624960.0, "grad_norm": 1.9796811445131552, "language_loss": 0.72257864, "learning_rate": 3.169142312417654e-06, "loss": 0.74373329, "num_input_tokens_seen": 107792020, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.578125, "step": 5012, "time_per_iteration": 2.381861686706543 }, { "auxiliary_loss_clip": 0.01080649, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.01197577, "balance_loss_mlp": 1.02380049, "epoch": 0.3013978656245303, "flos": 19791423613440.0, "grad_norm": 2.4962543051630366, "language_loss": 0.87719458, "learning_rate": 3.1688357482665622e-06, "loss": 0.89828885, "num_input_tokens_seen": 107809595, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5703125, "step": 5013, "time_per_iteration": 2.4279744625091553 }, { "auxiliary_loss_clip": 0.01083778, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.01557589, "balance_loss_mlp": 1.02564931, "epoch": 0.30145798887719827, "flos": 16248821800320.0, "grad_norm": 1.9163758464873473, "language_loss": 0.83223724, "learning_rate": 3.1685291424027293e-06, "loss": 0.85341853, "num_input_tokens_seen": 107827230, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.578125, "step": 5014, "time_per_iteration": 2.4155304431915283 }, { "auxiliary_loss_clip": 0.01076556, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.01521778, "balance_loss_mlp": 1.02403033, "epoch": 0.30151811212986623, "flos": 24570579427200.0, "grad_norm": 1.5817098941484453, "language_loss": 0.68388069, "learning_rate": 3.1682224948370973e-06, "loss": 0.70496058, "num_input_tokens_seen": 107847195, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5234375, "step": 5015, "time_per_iteration": 2.418494462966919 }, { "auxiliary_loss_clip": 0.01081318, "auxiliary_loss_mlp": 0.01035464, "balance_loss_clip": 1.01878095, "balance_loss_mlp": 1.02516627, "epoch": 0.3015782353825342, "flos": 21870990236160.0, "grad_norm": 2.1210306427495307, "language_loss": 0.74649143, "learning_rate": 3.1679158055806096e-06, "loss": 0.76765931, "num_input_tokens_seen": 107866420, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5625, "step": 5016, "time_per_iteration": 2.376962184906006 }, { "auxiliary_loss_clip": 0.01083765, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.01298952, "balance_loss_mlp": 1.02632689, "epoch": 0.30163835863520216, "flos": 28768012300800.0, "grad_norm": 1.5494112497778512, "language_loss": 0.65576136, "learning_rate": 3.1676090746442105e-06, "loss": 0.67690539, "num_input_tokens_seen": 107889090, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.57421875, "step": 5017, "time_per_iteration": 2.461987257003784 }, { "auxiliary_loss_clip": 0.01081179, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.01597965, "balance_loss_mlp": 1.02568746, "epoch": 0.30169848188787013, "flos": 22965098423040.0, "grad_norm": 2.3028788019207505, "language_loss": 0.68410343, "learning_rate": 3.1673023020388473e-06, "loss": 0.70524549, "num_input_tokens_seen": 107907520, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5546875, "step": 5018, "time_per_iteration": 2.3817498683929443 }, { "auxiliary_loss_clip": 0.01075965, "auxiliary_loss_mlp": 0.01035591, "balance_loss_clip": 1.02125061, "balance_loss_mlp": 1.02357864, "epoch": 0.3017586051405381, "flos": 21834191796480.0, "grad_norm": 2.172286130774456, "language_loss": 0.7911346, "learning_rate": 3.1669954877754677e-06, "loss": 0.81225014, "num_input_tokens_seen": 107925650, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.5234375, "step": 5019, "time_per_iteration": 2.382206439971924 }, { "auxiliary_loss_clip": 0.01081606, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.02221918, "balance_loss_mlp": 1.0261519, "epoch": 0.30181872839320606, "flos": 22159407911040.0, "grad_norm": 2.5271579119668712, "language_loss": 0.69893324, "learning_rate": 3.1666886318650206e-06, "loss": 0.72013378, "num_input_tokens_seen": 107943975, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5546875, "step": 5020, "time_per_iteration": 2.383802890777588 }, { "auxiliary_loss_clip": 0.01082946, "auxiliary_loss_mlp": 0.01038481, "balance_loss_clip": 1.02095699, "balance_loss_mlp": 1.02527726, "epoch": 0.301878851645874, "flos": 18113183602560.0, "grad_norm": 1.9021581714737108, "language_loss": 0.78609538, "learning_rate": 3.1663817343184576e-06, "loss": 0.80730963, "num_input_tokens_seen": 107962950, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.578125, "step": 5021, "time_per_iteration": 2.3886027336120605 }, { "auxiliary_loss_clip": 0.01077947, "auxiliary_loss_mlp": 0.01025726, "balance_loss_clip": 1.01038384, "balance_loss_mlp": 1.02288127, "epoch": 0.301938974898542, "flos": 17601287114880.0, "grad_norm": 2.2355073510638843, "language_loss": 0.75984716, "learning_rate": 3.166074795146731e-06, "loss": 0.78088391, "num_input_tokens_seen": 107979700, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.55078125, "step": 5022, "time_per_iteration": 2.3653833866119385 }, { "auxiliary_loss_clip": 0.01079659, "auxiliary_loss_mlp": 0.01038496, "balance_loss_clip": 1.0213654, "balance_loss_mlp": 1.02455592, "epoch": 0.30199909815120995, "flos": 11180445350400.0, "grad_norm": 1.7892522293300075, "language_loss": 0.69580376, "learning_rate": 3.1657678143607943e-06, "loss": 0.71698534, "num_input_tokens_seen": 107996645, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.55078125, "step": 5023, "time_per_iteration": 2.3766841888427734 }, { "auxiliary_loss_clip": 0.01082345, "auxiliary_loss_mlp": 0.01030614, "balance_loss_clip": 1.01438963, "balance_loss_mlp": 1.02483201, "epoch": 0.302059221403878, "flos": 21906776246400.0, "grad_norm": 1.9249216083430312, "language_loss": 0.71643651, "learning_rate": 3.165460791971603e-06, "loss": 0.73756611, "num_input_tokens_seen": 108015020, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.57421875, "step": 5024, "time_per_iteration": 2.4116077423095703 }, { "auxiliary_loss_clip": 0.01079141, "auxiliary_loss_mlp": 0.01034678, "balance_loss_clip": 1.01878715, "balance_loss_mlp": 1.02567625, "epoch": 0.30211934465654594, "flos": 26395175324160.0, "grad_norm": 1.7100408121435982, "language_loss": 0.74213982, "learning_rate": 3.1651537279901135e-06, "loss": 0.76327801, "num_input_tokens_seen": 108036430, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.53515625, "step": 5025, "time_per_iteration": 2.4468863010406494 }, { "auxiliary_loss_clip": 0.01077206, "auxiliary_loss_mlp": 0.01030976, "balance_loss_clip": 1.01613474, "balance_loss_mlp": 1.02416253, "epoch": 0.3021794679092139, "flos": 23399453047680.0, "grad_norm": 1.6971143565945075, "language_loss": 0.67114282, "learning_rate": 3.1648466224272854e-06, "loss": 0.69222462, "num_input_tokens_seen": 108054250, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.53125, "step": 5026, "time_per_iteration": 2.3966238498687744 }, { "auxiliary_loss_clip": 0.01079268, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 1.01547635, "balance_loss_mlp": 1.02524018, "epoch": 0.30223959116188187, "flos": 20260097971200.0, "grad_norm": 1.7340964198932574, "language_loss": 0.85200661, "learning_rate": 3.1645394752940772e-06, "loss": 0.87311411, "num_input_tokens_seen": 108071495, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5027, "time_per_iteration": 2.395587921142578 }, { "auxiliary_loss_clip": 0.01075981, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.01678801, "balance_loss_mlp": 1.02292156, "epoch": 0.30229971441454984, "flos": 26686630287360.0, "grad_norm": 1.591699639119321, "language_loss": 0.7846691, "learning_rate": 3.164232286601451e-06, "loss": 0.80574709, "num_input_tokens_seen": 108092135, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53125, "step": 5028, "time_per_iteration": 2.462153196334839 }, { "auxiliary_loss_clip": 0.01081643, "auxiliary_loss_mlp": 0.01032792, "balance_loss_clip": 1.01566148, "balance_loss_mlp": 1.02629507, "epoch": 0.3023598376672178, "flos": 34344026052480.0, "grad_norm": 1.73540489182245, "language_loss": 0.77354872, "learning_rate": 3.1639250563603686e-06, "loss": 0.79469311, "num_input_tokens_seen": 108112945, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5546875, "step": 5029, "time_per_iteration": 2.4995901584625244 }, { "auxiliary_loss_clip": 0.0108169, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.01339316, "balance_loss_mlp": 1.02492809, "epoch": 0.30241996091988577, "flos": 23111035372800.0, "grad_norm": 2.2703102274881046, "language_loss": 0.82117456, "learning_rate": 3.1636177845817954e-06, "loss": 0.84230042, "num_input_tokens_seen": 108130325, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5703125, "step": 5030, "time_per_iteration": 2.385838031768799 }, { "auxiliary_loss_clip": 0.01080079, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.01592588, "balance_loss_mlp": 1.0241152, "epoch": 0.30248008417255373, "flos": 19389014749440.0, "grad_norm": 1.65919876176385, "language_loss": 0.69756085, "learning_rate": 3.1633104712766967e-06, "loss": 0.71868753, "num_input_tokens_seen": 108150300, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55859375, "step": 5031, "time_per_iteration": 2.3964173793792725 }, { "auxiliary_loss_clip": 0.01078944, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.01582026, "balance_loss_mlp": 1.02500558, "epoch": 0.3025402074252217, "flos": 23768554608000.0, "grad_norm": 1.70541486126613, "language_loss": 0.82383776, "learning_rate": 3.1630031164560395e-06, "loss": 0.84494501, "num_input_tokens_seen": 108170330, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5032, "time_per_iteration": 3.774395227432251 }, { "auxiliary_loss_clip": 0.01088144, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.0226146, "balance_loss_mlp": 1.0278399, "epoch": 0.30260033067788966, "flos": 25992941016960.0, "grad_norm": 2.3848223321526363, "language_loss": 0.73535711, "learning_rate": 3.162695720130793e-06, "loss": 0.75665057, "num_input_tokens_seen": 108191265, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.6015625, "step": 5033, "time_per_iteration": 2.437077045440674 }, { "auxiliary_loss_clip": 0.01078681, "auxiliary_loss_mlp": 0.01038563, "balance_loss_clip": 1.02126586, "balance_loss_mlp": 1.02415955, "epoch": 0.3026604539305576, "flos": 25373372296320.0, "grad_norm": 2.0424456423279036, "language_loss": 0.73978257, "learning_rate": 3.1623882823119267e-06, "loss": 0.76095498, "num_input_tokens_seen": 108211615, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.546875, "step": 5034, "time_per_iteration": 2.4297237396240234 }, { "auxiliary_loss_clip": 0.0107787, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.01390791, "balance_loss_mlp": 1.02372336, "epoch": 0.3027205771832256, "flos": 25811532259200.0, "grad_norm": 2.0084966306521923, "language_loss": 0.72237194, "learning_rate": 3.1620808030104127e-06, "loss": 0.74345273, "num_input_tokens_seen": 108231080, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5390625, "step": 5035, "time_per_iteration": 3.821385383605957 }, { "auxiliary_loss_clip": 0.01078289, "auxiliary_loss_mlp": 0.01033219, "balance_loss_clip": 1.01765025, "balance_loss_mlp": 1.02332306, "epoch": 0.30278070043589356, "flos": 27343311649920.0, "grad_norm": 1.9518835134075647, "language_loss": 0.8750509, "learning_rate": 3.1617732822372237e-06, "loss": 0.89616603, "num_input_tokens_seen": 108251125, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.55078125, "step": 5036, "time_per_iteration": 2.4213244915008545 }, { "auxiliary_loss_clip": 0.01078881, "auxiliary_loss_mlp": 0.01034763, "balance_loss_clip": 1.01733446, "balance_loss_mlp": 1.02524662, "epoch": 0.3028408236885616, "flos": 24785190754560.0, "grad_norm": 1.4363712775314645, "language_loss": 0.77089381, "learning_rate": 3.1614657200033355e-06, "loss": 0.79203027, "num_input_tokens_seen": 108272545, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.53515625, "step": 5037, "time_per_iteration": 2.412492513656616 }, { "auxiliary_loss_clip": 0.01082724, "auxiliary_loss_mlp": 0.01036484, "balance_loss_clip": 1.01900804, "balance_loss_mlp": 1.02593899, "epoch": 0.30290094694122954, "flos": 12931653836160.0, "grad_norm": 2.685255131767566, "language_loss": 0.77590311, "learning_rate": 3.1611581163197228e-06, "loss": 0.79709518, "num_input_tokens_seen": 108289725, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.56640625, "step": 5038, "time_per_iteration": 3.7611536979675293 }, { "auxiliary_loss_clip": 0.01078642, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.01459527, "balance_loss_mlp": 1.02628255, "epoch": 0.3029610701938975, "flos": 25915399153920.0, "grad_norm": 1.8560384166378767, "language_loss": 0.73723853, "learning_rate": 3.160850471197364e-06, "loss": 0.75832093, "num_input_tokens_seen": 108310690, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5234375, "step": 5039, "time_per_iteration": 2.4268336296081543 }, { "auxiliary_loss_clip": 0.01075924, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.01786065, "balance_loss_mlp": 1.02448809, "epoch": 0.3030211934465655, "flos": 21979919278080.0, "grad_norm": 1.857112120650593, "language_loss": 0.8008182, "learning_rate": 3.160542784647238e-06, "loss": 0.82189566, "num_input_tokens_seen": 108328905, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.515625, "step": 5040, "time_per_iteration": 2.3754000663757324 }, { "auxiliary_loss_clip": 0.01079966, "auxiliary_loss_mlp": 0.01031537, "balance_loss_clip": 1.01614738, "balance_loss_mlp": 1.0265739, "epoch": 0.30308131669923344, "flos": 20991039528960.0, "grad_norm": 1.54185585551736, "language_loss": 0.81566, "learning_rate": 3.1602350566803254e-06, "loss": 0.836775, "num_input_tokens_seen": 108346680, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53125, "step": 5041, "time_per_iteration": 2.394514322280884 }, { "auxiliary_loss_clip": 0.01017723, "auxiliary_loss_mlp": 0.01017415, "balance_loss_clip": 1.01566231, "balance_loss_mlp": 1.00561452, "epoch": 0.3031414399519014, "flos": 60545641599360.0, "grad_norm": 0.7632627054647441, "language_loss": 0.59425414, "learning_rate": 3.1599272873076076e-06, "loss": 0.61460555, "num_input_tokens_seen": 108413885, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.12109375, "step": 5042, "time_per_iteration": 4.497772932052612 }, { "auxiliary_loss_clip": 0.01081884, "auxiliary_loss_mlp": 0.01028986, "balance_loss_clip": 1.01265407, "balance_loss_mlp": 1.02627993, "epoch": 0.30320156320456937, "flos": 21906601689600.0, "grad_norm": 1.6305844740537034, "language_loss": 0.7135089, "learning_rate": 3.159619476540069e-06, "loss": 0.73461759, "num_input_tokens_seen": 108433640, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5546875, "step": 5043, "time_per_iteration": 2.3825836181640625 }, { "auxiliary_loss_clip": 0.01077251, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.01621795, "balance_loss_mlp": 1.02385306, "epoch": 0.30326168645723733, "flos": 21651700786560.0, "grad_norm": 2.111412253450013, "language_loss": 0.69311726, "learning_rate": 3.1593116243886943e-06, "loss": 0.71420753, "num_input_tokens_seen": 108452640, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 5044, "time_per_iteration": 2.3893320560455322 }, { "auxiliary_loss_clip": 0.01076543, "auxiliary_loss_mlp": 0.01030771, "balance_loss_clip": 1.01541722, "balance_loss_mlp": 1.02340984, "epoch": 0.3033218097099053, "flos": 21870222186240.0, "grad_norm": 1.3166370881357141, "language_loss": 0.77194965, "learning_rate": 3.1590037308644695e-06, "loss": 0.79302281, "num_input_tokens_seen": 108472470, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53125, "step": 5045, "time_per_iteration": 2.3869130611419678 }, { "auxiliary_loss_clip": 0.01078538, "auxiliary_loss_mlp": 0.01037209, "balance_loss_clip": 1.0203464, "balance_loss_mlp": 1.02383971, "epoch": 0.30338193296257326, "flos": 27088480569600.0, "grad_norm": 1.8553203716814064, "language_loss": 0.72408873, "learning_rate": 3.158695795978383e-06, "loss": 0.74524617, "num_input_tokens_seen": 108493025, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.546875, "step": 5046, "time_per_iteration": 2.440122127532959 }, { "auxiliary_loss_clip": 0.01081281, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.01970959, "balance_loss_mlp": 1.02511406, "epoch": 0.30344205621524123, "flos": 19533415599360.0, "grad_norm": 3.2137425197330827, "language_loss": 0.80936623, "learning_rate": 3.1583878197414237e-06, "loss": 0.83055288, "num_input_tokens_seen": 108513480, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.5625, "step": 5047, "time_per_iteration": 2.4542839527130127 }, { "auxiliary_loss_clip": 0.01077701, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.01958573, "balance_loss_mlp": 1.02419674, "epoch": 0.3035021794679092, "flos": 23909953080960.0, "grad_norm": 1.716016772938192, "language_loss": 0.72124553, "learning_rate": 3.1580798021645833e-06, "loss": 0.74236959, "num_input_tokens_seen": 108533155, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.53515625, "step": 5048, "time_per_iteration": 2.417043924331665 }, { "auxiliary_loss_clip": 0.01078642, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.01817906, "balance_loss_mlp": 1.02458811, "epoch": 0.30356230272057716, "flos": 16142685667200.0, "grad_norm": 1.66838836362413, "language_loss": 0.75226957, "learning_rate": 3.157771743258854e-06, "loss": 0.77339017, "num_input_tokens_seen": 108551900, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.54296875, "step": 5049, "time_per_iteration": 2.376274585723877 }, { "auxiliary_loss_clip": 0.0108108, "auxiliary_loss_mlp": 0.01035606, "balance_loss_clip": 1.02032328, "balance_loss_mlp": 1.02525365, "epoch": 0.3036224259732452, "flos": 28913390668800.0, "grad_norm": 1.582238548426699, "language_loss": 0.81829733, "learning_rate": 3.1574636430352287e-06, "loss": 0.83946419, "num_input_tokens_seen": 108574005, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.55859375, "step": 5050, "time_per_iteration": 2.4505910873413086 }, { "auxiliary_loss_clip": 0.01080029, "auxiliary_loss_mlp": 0.01036879, "balance_loss_clip": 1.02098894, "balance_loss_mlp": 1.02524436, "epoch": 0.30368254922591315, "flos": 21104541959040.0, "grad_norm": 2.6646821233862417, "language_loss": 0.74094534, "learning_rate": 3.1571555015047036e-06, "loss": 0.76211441, "num_input_tokens_seen": 108592715, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 5051, "time_per_iteration": 2.3911256790161133 }, { "auxiliary_loss_clip": 0.01079081, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.01973069, "balance_loss_mlp": 1.02367949, "epoch": 0.3037426724785811, "flos": 23001198635520.0, "grad_norm": 1.579753752100834, "language_loss": 0.76945961, "learning_rate": 3.156847318678275e-06, "loss": 0.7905978, "num_input_tokens_seen": 108611770, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5546875, "step": 5052, "time_per_iteration": 2.4134862422943115 }, { "auxiliary_loss_clip": 0.01081814, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.01846075, "balance_loss_mlp": 1.02560735, "epoch": 0.3038027957312491, "flos": 15631801608960.0, "grad_norm": 1.9089504797738561, "language_loss": 0.82618737, "learning_rate": 3.156539094566941e-06, "loss": 0.84735525, "num_input_tokens_seen": 108629070, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5625, "step": 5053, "time_per_iteration": 2.367910623550415 }, { "auxiliary_loss_clip": 0.01079904, "auxiliary_loss_mlp": 0.01032492, "balance_loss_clip": 1.01688755, "balance_loss_mlp": 1.02545893, "epoch": 0.30386291898391704, "flos": 12713167347840.0, "grad_norm": 1.540145886429998, "language_loss": 0.71253401, "learning_rate": 3.1562308291817024e-06, "loss": 0.73365796, "num_input_tokens_seen": 108646315, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.54296875, "step": 5054, "time_per_iteration": 2.375654935836792 }, { "auxiliary_loss_clip": 0.01080617, "auxiliary_loss_mlp": 0.01031856, "balance_loss_clip": 1.01582265, "balance_loss_mlp": 1.0258913, "epoch": 0.303923042236585, "flos": 26358237239040.0, "grad_norm": 1.85087735015146, "language_loss": 0.69822037, "learning_rate": 3.15592252253356e-06, "loss": 0.71934509, "num_input_tokens_seen": 108665920, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5055, "time_per_iteration": 2.4142305850982666 }, { "auxiliary_loss_clip": 0.01080944, "auxiliary_loss_mlp": 0.01031188, "balance_loss_clip": 1.01447487, "balance_loss_mlp": 1.02540255, "epoch": 0.30398316548925297, "flos": 19718210759040.0, "grad_norm": 1.6823764906449163, "language_loss": 0.67452139, "learning_rate": 3.1556141746335153e-06, "loss": 0.69564271, "num_input_tokens_seen": 108683485, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5546875, "step": 5056, "time_per_iteration": 2.3767662048339844 }, { "auxiliary_loss_clip": 0.01079828, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.0155319, "balance_loss_mlp": 1.02489007, "epoch": 0.30404328874192094, "flos": 24238799976960.0, "grad_norm": 1.6025181443183105, "language_loss": 0.82537723, "learning_rate": 3.155305785492574e-06, "loss": 0.8464902, "num_input_tokens_seen": 108702700, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.55078125, "step": 5057, "time_per_iteration": 2.3970909118652344 }, { "auxiliary_loss_clip": 0.01077873, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.01315892, "balance_loss_mlp": 1.0239892, "epoch": 0.3041034119945889, "flos": 24497785509120.0, "grad_norm": 1.6665320149532368, "language_loss": 0.88529772, "learning_rate": 3.1549973551217408e-06, "loss": 0.90637207, "num_input_tokens_seen": 108721860, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5390625, "step": 5058, "time_per_iteration": 2.4081411361694336 }, { "auxiliary_loss_clip": 0.01080462, "auxiliary_loss_mlp": 0.01032894, "balance_loss_clip": 1.01594806, "balance_loss_mlp": 1.02553666, "epoch": 0.30416353524725687, "flos": 28287747371520.0, "grad_norm": 2.058088554296903, "language_loss": 0.71862209, "learning_rate": 3.1546888835320227e-06, "loss": 0.73975563, "num_input_tokens_seen": 108743215, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.55078125, "step": 5059, "time_per_iteration": 2.4287495613098145 }, { "auxiliary_loss_clip": 0.01079415, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.01785302, "balance_loss_mlp": 1.02515721, "epoch": 0.30422365849992483, "flos": 23659241541120.0, "grad_norm": 1.5643683043373413, "language_loss": 0.72890973, "learning_rate": 3.1543803707344284e-06, "loss": 0.7500397, "num_input_tokens_seen": 108765505, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5390625, "step": 5060, "time_per_iteration": 2.5202770233154297 }, { "auxiliary_loss_clip": 0.01079598, "auxiliary_loss_mlp": 0.01031143, "balance_loss_clip": 1.01508582, "balance_loss_mlp": 1.02489781, "epoch": 0.3042837817525928, "flos": 22997777322240.0, "grad_norm": 1.9526716156198396, "language_loss": 0.769113, "learning_rate": 3.154071816739969e-06, "loss": 0.79022038, "num_input_tokens_seen": 108783370, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5061, "time_per_iteration": 2.3840816020965576 }, { "auxiliary_loss_clip": 0.0108126, "auxiliary_loss_mlp": 0.01033333, "balance_loss_clip": 1.01638198, "balance_loss_mlp": 1.02594531, "epoch": 0.30434390500526076, "flos": 22081482023040.0, "grad_norm": 3.0975895023617093, "language_loss": 0.81807518, "learning_rate": 3.1537632215596542e-06, "loss": 0.83922112, "num_input_tokens_seen": 108797430, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5546875, "step": 5062, "time_per_iteration": 2.3743505477905273 }, { "auxiliary_loss_clip": 0.01075614, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.01559782, "balance_loss_mlp": 1.02252674, "epoch": 0.3044040282579287, "flos": 25482336249600.0, "grad_norm": 1.8621036098082937, "language_loss": 0.74603331, "learning_rate": 3.153454585204498e-06, "loss": 0.76710081, "num_input_tokens_seen": 108816945, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.53125, "step": 5063, "time_per_iteration": 2.4113378524780273 }, { "auxiliary_loss_clip": 0.01078599, "auxiliary_loss_mlp": 0.01033616, "balance_loss_clip": 1.01596713, "balance_loss_mlp": 1.02499104, "epoch": 0.30446415151059675, "flos": 21944377647360.0, "grad_norm": 2.0935570233285534, "language_loss": 0.84081364, "learning_rate": 3.153145907685515e-06, "loss": 0.86193573, "num_input_tokens_seen": 108836615, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.53515625, "step": 5064, "time_per_iteration": 2.3820180892944336 }, { "auxiliary_loss_clip": 0.010801, "auxiliary_loss_mlp": 0.01033188, "balance_loss_clip": 1.01652265, "balance_loss_mlp": 1.02495968, "epoch": 0.3045242747632647, "flos": 16434489744000.0, "grad_norm": 2.1758007655381744, "language_loss": 0.75466955, "learning_rate": 3.152837189013721e-06, "loss": 0.77580249, "num_input_tokens_seen": 108855165, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5546875, "step": 5065, "time_per_iteration": 2.3753256797790527 }, { "auxiliary_loss_clip": 0.01081433, "auxiliary_loss_mlp": 0.01033266, "balance_loss_clip": 1.016541, "balance_loss_mlp": 1.02405083, "epoch": 0.3045843980159327, "flos": 31538998955520.0, "grad_norm": 2.3023132129367916, "language_loss": 0.61482322, "learning_rate": 3.1525284292001323e-06, "loss": 0.63597018, "num_input_tokens_seen": 108874690, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.57421875, "step": 5066, "time_per_iteration": 2.466453790664673 }, { "auxiliary_loss_clip": 0.01086782, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.01843762, "balance_loss_mlp": 1.02784896, "epoch": 0.30464452126860064, "flos": 17852801616000.0, "grad_norm": 2.304756466861215, "language_loss": 0.82674682, "learning_rate": 3.1522196282557698e-06, "loss": 0.84797782, "num_input_tokens_seen": 108893140, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.58984375, "step": 5067, "time_per_iteration": 2.3969404697418213 }, { "auxiliary_loss_clip": 0.01078082, "auxiliary_loss_mlp": 0.0103114, "balance_loss_clip": 1.01597667, "balance_loss_mlp": 1.02457166, "epoch": 0.3047046445212686, "flos": 20630351606400.0, "grad_norm": 1.9227187056768802, "language_loss": 0.63247436, "learning_rate": 3.1519107861916516e-06, "loss": 0.6535666, "num_input_tokens_seen": 108911880, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53515625, "step": 5068, "time_per_iteration": 2.4060232639312744 }, { "auxiliary_loss_clip": 0.01077721, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.01345301, "balance_loss_mlp": 1.02360404, "epoch": 0.3047647677739366, "flos": 21286544209920.0, "grad_norm": 1.8648866684163907, "language_loss": 0.75149035, "learning_rate": 3.151601903018801e-06, "loss": 0.77255321, "num_input_tokens_seen": 108930440, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5390625, "step": 5069, "time_per_iteration": 2.385392665863037 }, { "auxiliary_loss_clip": 0.01076768, "auxiliary_loss_mlp": 0.01035412, "balance_loss_clip": 1.01986074, "balance_loss_mlp": 1.02529395, "epoch": 0.30482489102660454, "flos": 20994879778560.0, "grad_norm": 1.9300514163373805, "language_loss": 0.7524488, "learning_rate": 3.1512929787482405e-06, "loss": 0.77357066, "num_input_tokens_seen": 108949125, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.515625, "step": 5070, "time_per_iteration": 2.3724310398101807 }, { "auxiliary_loss_clip": 0.01082619, "auxiliary_loss_mlp": 0.01033286, "balance_loss_clip": 1.01584578, "balance_loss_mlp": 1.02601182, "epoch": 0.3048850142792725, "flos": 26289493038720.0, "grad_norm": 1.8750056174865726, "language_loss": 0.81683183, "learning_rate": 3.150984013390995e-06, "loss": 0.83799082, "num_input_tokens_seen": 108972190, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.56640625, "step": 5071, "time_per_iteration": 2.447324275970459 }, { "auxiliary_loss_clip": 0.01080769, "auxiliary_loss_mlp": 0.01034315, "balance_loss_clip": 1.01735187, "balance_loss_mlp": 1.02393794, "epoch": 0.30494513753194047, "flos": 22345145677440.0, "grad_norm": 4.689007473164586, "language_loss": 0.75940239, "learning_rate": 3.1506750069580916e-06, "loss": 0.78055316, "num_input_tokens_seen": 108990325, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5703125, "step": 5072, "time_per_iteration": 3.8788299560546875 }, { "auxiliary_loss_clip": 0.01078904, "auxiliary_loss_mlp": 0.01036576, "balance_loss_clip": 1.02013671, "balance_loss_mlp": 1.02535915, "epoch": 0.30500526078460843, "flos": 19536627444480.0, "grad_norm": 1.8412488787990688, "language_loss": 0.71410644, "learning_rate": 3.150365959460556e-06, "loss": 0.7352612, "num_input_tokens_seen": 109009505, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53515625, "step": 5073, "time_per_iteration": 2.433459997177124 }, { "auxiliary_loss_clip": 0.01083685, "auxiliary_loss_mlp": 0.01035263, "balance_loss_clip": 1.01834738, "balance_loss_mlp": 1.02588344, "epoch": 0.3050653840372764, "flos": 14464445656320.0, "grad_norm": 2.3076210393906336, "language_loss": 0.76898336, "learning_rate": 3.150056870909419e-06, "loss": 0.79017282, "num_input_tokens_seen": 109026350, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.578125, "step": 5074, "time_per_iteration": 2.36505126953125 }, { "auxiliary_loss_clip": 0.01079156, "auxiliary_loss_mlp": 0.01035008, "balance_loss_clip": 1.01960599, "balance_loss_mlp": 1.02561998, "epoch": 0.30512550728994436, "flos": 24242640226560.0, "grad_norm": 1.8174893898110396, "language_loss": 0.74389237, "learning_rate": 3.1497477413157107e-06, "loss": 0.76503408, "num_input_tokens_seen": 109044165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53515625, "step": 5075, "time_per_iteration": 3.7757163047790527 }, { "auxiliary_loss_clip": 0.01082864, "auxiliary_loss_mlp": 0.01033601, "balance_loss_clip": 1.01577902, "balance_loss_mlp": 1.02590251, "epoch": 0.30518563054261233, "flos": 16359670967040.0, "grad_norm": 1.9048589054866303, "language_loss": 0.75819647, "learning_rate": 3.1494385706904625e-06, "loss": 0.77936113, "num_input_tokens_seen": 109060665, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5703125, "step": 5076, "time_per_iteration": 2.3512556552886963 }, { "auxiliary_loss_clip": 0.01082248, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.01760733, "balance_loss_mlp": 1.02683091, "epoch": 0.30524575379528035, "flos": 21578522843520.0, "grad_norm": 2.3187040054924184, "language_loss": 0.79411626, "learning_rate": 3.149129359044709e-06, "loss": 0.8152799, "num_input_tokens_seen": 109080035, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5546875, "step": 5077, "time_per_iteration": 3.752131462097168 }, { "auxiliary_loss_clip": 0.01076754, "auxiliary_loss_mlp": 0.01027194, "balance_loss_clip": 1.01142287, "balance_loss_mlp": 1.02505755, "epoch": 0.3053058770479483, "flos": 16544291569920.0, "grad_norm": 1.8015275940502062, "language_loss": 0.74522299, "learning_rate": 3.148820106389484e-06, "loss": 0.76626241, "num_input_tokens_seen": 109097385, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.515625, "step": 5078, "time_per_iteration": 2.3557209968566895 }, { "auxiliary_loss_clip": 0.01075746, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 1.01947069, "balance_loss_mlp": 1.02324128, "epoch": 0.3053660003006163, "flos": 12312085115520.0, "grad_norm": 2.0557977823221494, "language_loss": 0.66679752, "learning_rate": 3.1485108127358246e-06, "loss": 0.68790823, "num_input_tokens_seen": 109115495, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5234375, "step": 5079, "time_per_iteration": 2.3630056381225586 }, { "auxiliary_loss_clip": 0.01079402, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.01822281, "balance_loss_mlp": 1.0249331, "epoch": 0.30542612355328425, "flos": 23111175018240.0, "grad_norm": 2.3169130925395263, "language_loss": 0.79645264, "learning_rate": 3.1482014780947693e-06, "loss": 0.81758577, "num_input_tokens_seen": 109134235, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.546875, "step": 5080, "time_per_iteration": 2.3906824588775635 }, { "auxiliary_loss_clip": 0.01078595, "auxiliary_loss_mlp": 0.01038239, "balance_loss_clip": 1.02320099, "balance_loss_mlp": 1.02523494, "epoch": 0.3054862468059522, "flos": 24388297885440.0, "grad_norm": 2.1723114008830637, "language_loss": 0.80782568, "learning_rate": 3.147892102477356e-06, "loss": 0.82899398, "num_input_tokens_seen": 109152760, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53125, "step": 5081, "time_per_iteration": 3.762012004852295 }, { "auxiliary_loss_clip": 0.01077588, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.01792085, "balance_loss_mlp": 1.0235467, "epoch": 0.3055463700586202, "flos": 29384857935360.0, "grad_norm": 1.8994625477612541, "language_loss": 0.72201818, "learning_rate": 3.147582685894627e-06, "loss": 0.74313563, "num_input_tokens_seen": 109173925, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5390625, "step": 5082, "time_per_iteration": 2.432049512863159 }, { "auxiliary_loss_clip": 0.010815, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.01512098, "balance_loss_mlp": 1.02457976, "epoch": 0.30560649331128814, "flos": 25590636887040.0, "grad_norm": 1.869577299281497, "language_loss": 0.73351568, "learning_rate": 3.1472732283576226e-06, "loss": 0.75465655, "num_input_tokens_seen": 109192510, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.56640625, "step": 5083, "time_per_iteration": 2.410402297973633 }, { "auxiliary_loss_clip": 0.01079138, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.01243687, "balance_loss_mlp": 1.0233649, "epoch": 0.3056666165639561, "flos": 19127515599360.0, "grad_norm": 1.8073700407937934, "language_loss": 0.71143746, "learning_rate": 3.146963729877389e-06, "loss": 0.73251641, "num_input_tokens_seen": 109210885, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.55859375, "step": 5084, "time_per_iteration": 2.3652381896972656 }, { "auxiliary_loss_clip": 0.01080707, "auxiliary_loss_mlp": 0.01030032, "balance_loss_clip": 1.01336694, "balance_loss_mlp": 1.02467549, "epoch": 0.30572673981662407, "flos": 15522942389760.0, "grad_norm": 1.742273901997806, "language_loss": 0.78633082, "learning_rate": 3.1466541904649698e-06, "loss": 0.80743825, "num_input_tokens_seen": 109229180, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5625, "step": 5085, "time_per_iteration": 2.380459785461426 }, { "auxiliary_loss_clip": 0.01075721, "auxiliary_loss_mlp": 0.01030383, "balance_loss_clip": 1.01443315, "balance_loss_mlp": 1.02337861, "epoch": 0.30578686306929204, "flos": 21505484545920.0, "grad_norm": 2.047968027875394, "language_loss": 0.78112531, "learning_rate": 3.1463446101314118e-06, "loss": 0.80218637, "num_input_tokens_seen": 109249510, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5234375, "step": 5086, "time_per_iteration": 2.3850786685943604 }, { "auxiliary_loss_clip": 0.01078415, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.01820946, "balance_loss_mlp": 1.02396393, "epoch": 0.30584698632196, "flos": 20953368305280.0, "grad_norm": 1.8084602686651856, "language_loss": 0.76741385, "learning_rate": 3.1460349888877645e-06, "loss": 0.78854293, "num_input_tokens_seen": 109268200, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.546875, "step": 5087, "time_per_iteration": 2.3894331455230713 }, { "auxiliary_loss_clip": 0.01080381, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.01379085, "balance_loss_mlp": 1.0240941, "epoch": 0.30590710957462797, "flos": 24679962316800.0, "grad_norm": 2.484657609352947, "language_loss": 0.72525501, "learning_rate": 3.1457253267450756e-06, "loss": 0.74637347, "num_input_tokens_seen": 109288370, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.5625, "step": 5088, "time_per_iteration": 2.4178121089935303 }, { "auxiliary_loss_clip": 0.01081955, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.01588249, "balance_loss_mlp": 1.02458477, "epoch": 0.30596723282729593, "flos": 17086108959360.0, "grad_norm": 2.226747048899952, "language_loss": 0.79293025, "learning_rate": 3.145415623714397e-06, "loss": 0.81407434, "num_input_tokens_seen": 109306730, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5703125, "step": 5089, "time_per_iteration": 2.373013496398926 }, { "auxiliary_loss_clip": 0.01078708, "auxiliary_loss_mlp": 0.01035366, "balance_loss_clip": 1.01964855, "balance_loss_mlp": 1.02459717, "epoch": 0.30602735607996395, "flos": 22855994824320.0, "grad_norm": 1.6334420631479762, "language_loss": 0.76835668, "learning_rate": 3.145105879806781e-06, "loss": 0.78949744, "num_input_tokens_seen": 109327360, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5390625, "step": 5090, "time_per_iteration": 2.3933351039886475 }, { "auxiliary_loss_clip": 0.01084782, "auxiliary_loss_mlp": 0.01038957, "balance_loss_clip": 1.0215404, "balance_loss_mlp": 1.02664232, "epoch": 0.3060874793326319, "flos": 29860200362880.0, "grad_norm": 1.789059172379312, "language_loss": 0.76110858, "learning_rate": 3.144796095033282e-06, "loss": 0.78234595, "num_input_tokens_seen": 109348135, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.58203125, "step": 5091, "time_per_iteration": 2.446603775024414 }, { "auxiliary_loss_clip": 0.01082083, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.01585746, "balance_loss_mlp": 1.02568531, "epoch": 0.3061476025852999, "flos": 20447546394240.0, "grad_norm": 4.361862532620491, "language_loss": 0.71736872, "learning_rate": 3.1444862694049548e-06, "loss": 0.73850983, "num_input_tokens_seen": 109366220, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.56640625, "step": 5092, "time_per_iteration": 2.392932891845703 }, { "auxiliary_loss_clip": 0.01077351, "auxiliary_loss_mlp": 0.01030519, "balance_loss_clip": 1.01486683, "balance_loss_mlp": 1.02375352, "epoch": 0.30620772583796785, "flos": 19390446115200.0, "grad_norm": 2.0449167330303504, "language_loss": 0.82672793, "learning_rate": 3.144176402932857e-06, "loss": 0.84780663, "num_input_tokens_seen": 109385260, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5390625, "step": 5093, "time_per_iteration": 2.3958449363708496 }, { "auxiliary_loss_clip": 0.01079465, "auxiliary_loss_mlp": 0.01033301, "balance_loss_clip": 1.01837611, "balance_loss_mlp": 1.02562737, "epoch": 0.3062678490906358, "flos": 24023420599680.0, "grad_norm": 1.7334995034031653, "language_loss": 0.74549633, "learning_rate": 3.143866495628046e-06, "loss": 0.76662397, "num_input_tokens_seen": 109405025, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5390625, "step": 5094, "time_per_iteration": 2.4137465953826904 }, { "auxiliary_loss_clip": 0.0107656, "auxiliary_loss_mlp": 0.01031346, "balance_loss_clip": 1.01611078, "balance_loss_mlp": 1.02357149, "epoch": 0.3063279723433038, "flos": 19753647655680.0, "grad_norm": 1.8931123445568476, "language_loss": 0.75921643, "learning_rate": 3.1435565475015827e-06, "loss": 0.78029549, "num_input_tokens_seen": 109422465, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53125, "step": 5095, "time_per_iteration": 2.3703372478485107 }, { "auxiliary_loss_clip": 0.01076011, "auxiliary_loss_mlp": 0.01032305, "balance_loss_clip": 1.01668847, "balance_loss_mlp": 1.02395868, "epoch": 0.30638809559597174, "flos": 22449082394880.0, "grad_norm": 2.143575018854079, "language_loss": 0.80498981, "learning_rate": 3.143246558564528e-06, "loss": 0.82607299, "num_input_tokens_seen": 109440575, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51953125, "step": 5096, "time_per_iteration": 2.387840509414673 }, { "auxiliary_loss_clip": 0.01076872, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.0137099, "balance_loss_mlp": 1.02281272, "epoch": 0.3064482188486397, "flos": 17164209404160.0, "grad_norm": 3.3072272142460304, "language_loss": 0.8172816, "learning_rate": 3.1429365288279437e-06, "loss": 0.83834636, "num_input_tokens_seen": 109459050, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5390625, "step": 5097, "time_per_iteration": 2.352224826812744 }, { "auxiliary_loss_clip": 0.0107899, "auxiliary_loss_mlp": 0.01035665, "balance_loss_clip": 1.01889229, "balance_loss_mlp": 1.02406096, "epoch": 0.3065083421013077, "flos": 23767367621760.0, "grad_norm": 2.10298827540818, "language_loss": 0.78092313, "learning_rate": 3.142626458302895e-06, "loss": 0.80206966, "num_input_tokens_seen": 109475860, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.546875, "step": 5098, "time_per_iteration": 2.3999123573303223 }, { "auxiliary_loss_clip": 0.01077848, "auxiliary_loss_mlp": 0.0102936, "balance_loss_clip": 1.01379156, "balance_loss_mlp": 1.02454019, "epoch": 0.30656846535397564, "flos": 26430647132160.0, "grad_norm": 1.7533503981649752, "language_loss": 0.84115088, "learning_rate": 3.1423163470004473e-06, "loss": 0.86222291, "num_input_tokens_seen": 109494760, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.53125, "step": 5099, "time_per_iteration": 2.4063992500305176 }, { "auxiliary_loss_clip": 0.01079286, "auxiliary_loss_mlp": 0.01036687, "balance_loss_clip": 1.02019453, "balance_loss_mlp": 1.02415967, "epoch": 0.3066285886066436, "flos": 26650564986240.0, "grad_norm": 1.5899167638724356, "language_loss": 0.8568939, "learning_rate": 3.1420061949316676e-06, "loss": 0.87805367, "num_input_tokens_seen": 109516480, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.55078125, "step": 5100, "time_per_iteration": 2.4158122539520264 }, { "auxiliary_loss_clip": 0.01076121, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.01454246, "balance_loss_mlp": 1.02231455, "epoch": 0.30668871185931157, "flos": 15049031328000.0, "grad_norm": 1.888937394141001, "language_loss": 0.79053181, "learning_rate": 3.141696002107624e-06, "loss": 0.81160194, "num_input_tokens_seen": 109534615, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5390625, "step": 5101, "time_per_iteration": 2.359696626663208 }, { "auxiliary_loss_clip": 0.01079749, "auxiliary_loss_mlp": 0.01038396, "balance_loss_clip": 1.02103889, "balance_loss_mlp": 1.0245868, "epoch": 0.30674883511197953, "flos": 20081133008640.0, "grad_norm": 1.6623602981726566, "language_loss": 0.80277586, "learning_rate": 3.1413857685393873e-06, "loss": 0.82395732, "num_input_tokens_seen": 109554040, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.55078125, "step": 5102, "time_per_iteration": 2.3827927112579346 }, { "auxiliary_loss_clip": 0.01081269, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.01548934, "balance_loss_mlp": 1.02632546, "epoch": 0.30680895836464755, "flos": 22892688529920.0, "grad_norm": 1.9847136357214878, "language_loss": 0.88668692, "learning_rate": 3.1410754942380287e-06, "loss": 0.90782547, "num_input_tokens_seen": 109574345, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.55078125, "step": 5103, "time_per_iteration": 2.3850743770599365 }, { "auxiliary_loss_clip": 0.01077684, "auxiliary_loss_mlp": 0.01033489, "balance_loss_clip": 1.01766443, "balance_loss_mlp": 1.02336264, "epoch": 0.3068690816173155, "flos": 23695027551360.0, "grad_norm": 7.294134692163468, "language_loss": 0.7403397, "learning_rate": 3.1407651792146204e-06, "loss": 0.76145148, "num_input_tokens_seen": 109593670, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5390625, "step": 5104, "time_per_iteration": 2.390892505645752 }, { "auxiliary_loss_clip": 0.01079635, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.01887, "balance_loss_mlp": 1.02392173, "epoch": 0.3069292048699835, "flos": 23549893562880.0, "grad_norm": 2.122087033959047, "language_loss": 0.72853553, "learning_rate": 3.1404548234802376e-06, "loss": 0.74968719, "num_input_tokens_seen": 109613385, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.55859375, "step": 5105, "time_per_iteration": 2.3908679485321045 }, { "auxiliary_loss_clip": 0.01081192, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.01637292, "balance_loss_mlp": 1.02452767, "epoch": 0.30698932812265145, "flos": 24530604053760.0, "grad_norm": 1.7685048479415502, "language_loss": 0.8725996, "learning_rate": 3.140144427045955e-06, "loss": 0.89374506, "num_input_tokens_seen": 109632395, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.56640625, "step": 5106, "time_per_iteration": 2.426076650619507 }, { "auxiliary_loss_clip": 0.01081552, "auxiliary_loss_mlp": 0.01040172, "balance_loss_clip": 1.02134919, "balance_loss_mlp": 1.02422607, "epoch": 0.3070494513753194, "flos": 20995368537600.0, "grad_norm": 2.4336028654453337, "language_loss": 0.71578121, "learning_rate": 3.1398339899228512e-06, "loss": 0.73699844, "num_input_tokens_seen": 109651380, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.5703125, "step": 5107, "time_per_iteration": 2.3681674003601074 }, { "auxiliary_loss_clip": 0.01077889, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.02060497, "balance_loss_mlp": 1.02384579, "epoch": 0.3071095746279874, "flos": 19024940424960.0, "grad_norm": 2.2126667843796572, "language_loss": 0.72114658, "learning_rate": 3.139523512122005e-06, "loss": 0.74228561, "num_input_tokens_seen": 109670240, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5390625, "step": 5108, "time_per_iteration": 2.372995615005493 }, { "auxiliary_loss_clip": 0.01077898, "auxiliary_loss_mlp": 0.01030447, "balance_loss_clip": 1.01489043, "balance_loss_mlp": 1.02421427, "epoch": 0.30716969788065535, "flos": 21214448519040.0, "grad_norm": 2.075612682080276, "language_loss": 0.85682523, "learning_rate": 3.1392129936544947e-06, "loss": 0.87790871, "num_input_tokens_seen": 109690810, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5390625, "step": 5109, "time_per_iteration": 2.3912501335144043 }, { "auxiliary_loss_clip": 0.01077607, "auxiliary_loss_mlp": 0.01028635, "balance_loss_clip": 1.01360869, "balance_loss_mlp": 1.02277243, "epoch": 0.3072298211333233, "flos": 25771661619840.0, "grad_norm": 1.6016735539815112, "language_loss": 0.6779108, "learning_rate": 3.1389024345314033e-06, "loss": 0.69897318, "num_input_tokens_seen": 109711145, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.546875, "step": 5110, "time_per_iteration": 2.429046392440796 }, { "auxiliary_loss_clip": 0.01076413, "auxiliary_loss_mlp": 0.0102863, "balance_loss_clip": 1.01363373, "balance_loss_mlp": 1.02355778, "epoch": 0.3072899443859913, "flos": 25847737205760.0, "grad_norm": 1.4264155370175726, "language_loss": 0.7684719, "learning_rate": 3.1385918347638142e-06, "loss": 0.78952235, "num_input_tokens_seen": 109731425, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.52734375, "step": 5111, "time_per_iteration": 2.4254255294799805 }, { "auxiliary_loss_clip": 0.01078433, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.01573277, "balance_loss_mlp": 1.02413774, "epoch": 0.30735006763865924, "flos": 25921578464640.0, "grad_norm": 2.7580918109065435, "language_loss": 0.66969657, "learning_rate": 3.1382811943628107e-06, "loss": 0.69079721, "num_input_tokens_seen": 109752720, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.54296875, "step": 5112, "time_per_iteration": 3.832089424133301 }, { "auxiliary_loss_clip": 0.01081321, "auxiliary_loss_mlp": 0.01037913, "balance_loss_clip": 1.02080607, "balance_loss_mlp": 1.02410853, "epoch": 0.3074101908913272, "flos": 30915764542080.0, "grad_norm": 3.249391851251878, "language_loss": 0.79351687, "learning_rate": 3.1379705133394793e-06, "loss": 0.81470919, "num_input_tokens_seen": 109772840, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5703125, "step": 5113, "time_per_iteration": 2.468846082687378 }, { "auxiliary_loss_clip": 0.01077078, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.02079749, "balance_loss_mlp": 1.02224755, "epoch": 0.30747031414399517, "flos": 18400204823040.0, "grad_norm": 2.412408745969876, "language_loss": 0.76614761, "learning_rate": 3.1376597917049084e-06, "loss": 0.7872777, "num_input_tokens_seen": 109790150, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.546875, "step": 5114, "time_per_iteration": 3.7922019958496094 }, { "auxiliary_loss_clip": 0.01080715, "auxiliary_loss_mlp": 0.01030119, "balance_loss_clip": 1.01314354, "balance_loss_mlp": 1.02512193, "epoch": 0.30753043739666314, "flos": 22632201809280.0, "grad_norm": 1.7376376609500717, "language_loss": 0.62275064, "learning_rate": 3.1373490294701853e-06, "loss": 0.64385897, "num_input_tokens_seen": 109807985, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5546875, "step": 5115, "time_per_iteration": 2.385173797607422 }, { "auxiliary_loss_clip": 0.01080656, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.01310694, "balance_loss_mlp": 1.02506423, "epoch": 0.3075905606493311, "flos": 27342857802240.0, "grad_norm": 1.7905529204636104, "language_loss": 0.83087099, "learning_rate": 3.1370382266464007e-06, "loss": 0.85196483, "num_input_tokens_seen": 109825920, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5546875, "step": 5116, "time_per_iteration": 2.42085862159729 }, { "auxiliary_loss_clip": 0.01076563, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.02163982, "balance_loss_mlp": 1.02344823, "epoch": 0.3076506839019991, "flos": 22089721104000.0, "grad_norm": 1.9990810770592031, "language_loss": 0.75865328, "learning_rate": 3.136727383244647e-06, "loss": 0.77979636, "num_input_tokens_seen": 109846220, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.53125, "step": 5117, "time_per_iteration": 3.773503541946411 }, { "auxiliary_loss_clip": 0.01080605, "auxiliary_loss_mlp": 0.01031636, "balance_loss_clip": 1.01538754, "balance_loss_mlp": 1.02455497, "epoch": 0.3077108071546671, "flos": 21288429423360.0, "grad_norm": 2.2087428475027737, "language_loss": 0.71562529, "learning_rate": 3.136416499276017e-06, "loss": 0.73674774, "num_input_tokens_seen": 109863870, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55859375, "step": 5118, "time_per_iteration": 2.3758347034454346 }, { "auxiliary_loss_clip": 0.01080015, "auxiliary_loss_mlp": 0.01028134, "balance_loss_clip": 1.01220727, "balance_loss_mlp": 1.02541113, "epoch": 0.30777093040733505, "flos": 21430002453120.0, "grad_norm": 1.5744646214494442, "language_loss": 0.74403101, "learning_rate": 3.136105574751605e-06, "loss": 0.76511252, "num_input_tokens_seen": 109883500, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.546875, "step": 5119, "time_per_iteration": 2.3935563564300537 }, { "auxiliary_loss_clip": 0.01082104, "auxiliary_loss_mlp": 0.01043753, "balance_loss_clip": 1.0266521, "balance_loss_mlp": 1.025195, "epoch": 0.307831053660003, "flos": 23148148014720.0, "grad_norm": 9.304577538731047, "language_loss": 0.80417204, "learning_rate": 3.135794609682508e-06, "loss": 0.82543057, "num_input_tokens_seen": 109904620, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5703125, "step": 5120, "time_per_iteration": 3.8386504650115967 }, { "auxiliary_loss_clip": 0.01076532, "auxiliary_loss_mlp": 0.01036613, "balance_loss_clip": 1.02130711, "balance_loss_mlp": 1.02279353, "epoch": 0.307891176912671, "flos": 17018796124800.0, "grad_norm": 2.262056870766989, "language_loss": 0.80122209, "learning_rate": 3.135483604079823e-06, "loss": 0.8223536, "num_input_tokens_seen": 109922275, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5390625, "step": 5121, "time_per_iteration": 2.3527040481567383 }, { "auxiliary_loss_clip": 0.0107719, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.01165867, "balance_loss_mlp": 1.02393687, "epoch": 0.30795130016533895, "flos": 27703929749760.0, "grad_norm": 1.4726564421949742, "language_loss": 0.8263222, "learning_rate": 3.1351725579546484e-06, "loss": 0.84736788, "num_input_tokens_seen": 109944265, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 5122, "time_per_iteration": 2.42997407913208 }, { "auxiliary_loss_clip": 0.01019117, "auxiliary_loss_mlp": 0.0101245, "balance_loss_clip": 1.01079261, "balance_loss_mlp": 1.00643384, "epoch": 0.3080114234180069, "flos": 69054987294720.0, "grad_norm": 0.8950464042382977, "language_loss": 0.58587706, "learning_rate": 3.134861471318086e-06, "loss": 0.60619271, "num_input_tokens_seen": 110014160, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.12695312, "step": 5123, "time_per_iteration": 3.202012062072754 }, { "auxiliary_loss_clip": 0.01079796, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.01499224, "balance_loss_mlp": 1.02367806, "epoch": 0.3080715466706749, "flos": 24059101875840.0, "grad_norm": 2.1068432353962487, "language_loss": 0.83148974, "learning_rate": 3.1345503441812357e-06, "loss": 0.85261643, "num_input_tokens_seen": 110034865, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5625, "step": 5124, "time_per_iteration": 2.4288156032562256 }, { "auxiliary_loss_clip": 0.01082191, "auxiliary_loss_mlp": 0.01031345, "balance_loss_clip": 1.01534677, "balance_loss_mlp": 1.02525115, "epoch": 0.30813166992334284, "flos": 25847492826240.0, "grad_norm": 1.8736936207917514, "language_loss": 0.79029876, "learning_rate": 3.1342391765552032e-06, "loss": 0.81143409, "num_input_tokens_seen": 110052930, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5703125, "step": 5125, "time_per_iteration": 2.4141275882720947 }, { "auxiliary_loss_clip": 0.01081219, "auxiliary_loss_mlp": 0.01035264, "balance_loss_clip": 1.01808608, "balance_loss_mlp": 1.02440333, "epoch": 0.3081917931760108, "flos": 20448558823680.0, "grad_norm": 1.8872369700136113, "language_loss": 0.64191848, "learning_rate": 3.1339279684510916e-06, "loss": 0.66308331, "num_input_tokens_seen": 110071765, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5703125, "step": 5126, "time_per_iteration": 2.366394519805908 }, { "auxiliary_loss_clip": 0.01077885, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.0156908, "balance_loss_mlp": 1.02465594, "epoch": 0.3082519164286788, "flos": 22165098462720.0, "grad_norm": 2.7576994911302997, "language_loss": 0.86689198, "learning_rate": 3.1336167198800072e-06, "loss": 0.88797724, "num_input_tokens_seen": 110092660, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.53125, "step": 5127, "time_per_iteration": 2.3918304443359375 }, { "auxiliary_loss_clip": 0.01077635, "auxiliary_loss_mlp": 0.01035878, "balance_loss_clip": 1.01923609, "balance_loss_mlp": 1.02379274, "epoch": 0.30831203968134674, "flos": 28912133859840.0, "grad_norm": 1.92067605870235, "language_loss": 0.68613291, "learning_rate": 3.133305430853059e-06, "loss": 0.707268, "num_input_tokens_seen": 110114960, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5390625, "step": 5128, "time_per_iteration": 2.4436962604522705 }, { "auxiliary_loss_clip": 0.01080492, "auxiliary_loss_mlp": 0.0103474, "balance_loss_clip": 1.01841998, "balance_loss_mlp": 1.02417898, "epoch": 0.3083721629340147, "flos": 25666503004800.0, "grad_norm": 1.751935711431384, "language_loss": 0.71321845, "learning_rate": 3.132994101381354e-06, "loss": 0.73437083, "num_input_tokens_seen": 110135750, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5625, "step": 5129, "time_per_iteration": 2.4182329177856445 }, { "auxiliary_loss_clip": 0.01017501, "auxiliary_loss_mlp": 0.0100063, "balance_loss_clip": 0.99912214, "balance_loss_mlp": 1.00459743, "epoch": 0.3084322861866827, "flos": 68209181763840.0, "grad_norm": 0.8376185662338888, "language_loss": 0.59226048, "learning_rate": 3.132682731476005e-06, "loss": 0.61244178, "num_input_tokens_seen": 110189480, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.12890625, "step": 5130, "time_per_iteration": 2.9858717918395996 }, { "auxiliary_loss_clip": 0.01080878, "auxiliary_loss_mlp": 0.01033931, "balance_loss_clip": 1.01668143, "balance_loss_mlp": 1.02439547, "epoch": 0.3084924094393507, "flos": 20295639601920.0, "grad_norm": 2.6875767153783348, "language_loss": 0.72677922, "learning_rate": 3.1323713211481227e-06, "loss": 0.74792731, "num_input_tokens_seen": 110206445, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5625, "step": 5131, "time_per_iteration": 2.373452663421631 }, { "auxiliary_loss_clip": 0.01079323, "auxiliary_loss_mlp": 0.01030824, "balance_loss_clip": 1.01579165, "balance_loss_mlp": 1.02553153, "epoch": 0.30855253269201866, "flos": 23948741468160.0, "grad_norm": 5.220404308141966, "language_loss": 0.71094596, "learning_rate": 3.1320598704088204e-06, "loss": 0.73204744, "num_input_tokens_seen": 110226845, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5390625, "step": 5132, "time_per_iteration": 2.4195947647094727 }, { "auxiliary_loss_clip": 0.01076501, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.01643753, "balance_loss_mlp": 1.02521384, "epoch": 0.3086126559446866, "flos": 19280853757440.0, "grad_norm": 1.9599700961515136, "language_loss": 0.90192401, "learning_rate": 3.1317483792692136e-06, "loss": 0.92299753, "num_input_tokens_seen": 110244095, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.51171875, "step": 5133, "time_per_iteration": 2.385739326477051 }, { "auxiliary_loss_clip": 0.01081413, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.0222894, "balance_loss_mlp": 1.02509832, "epoch": 0.3086727791973546, "flos": 33759510203520.0, "grad_norm": 1.7427786961028637, "language_loss": 0.67729735, "learning_rate": 3.131436847740418e-06, "loss": 0.69851029, "num_input_tokens_seen": 110264240, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5625, "step": 5134, "time_per_iteration": 2.5005271434783936 }, { "auxiliary_loss_clip": 0.01082007, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.01270247, "balance_loss_mlp": 1.02506828, "epoch": 0.30873290245002255, "flos": 16033232954880.0, "grad_norm": 1.975231147235075, "language_loss": 0.82983732, "learning_rate": 3.1311252758335523e-06, "loss": 0.85095048, "num_input_tokens_seen": 110282450, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5703125, "step": 5135, "time_per_iteration": 2.3567006587982178 }, { "auxiliary_loss_clip": 0.0101515, "auxiliary_loss_mlp": 0.01000613, "balance_loss_clip": 0.99915224, "balance_loss_mlp": 1.00262022, "epoch": 0.3087930257026905, "flos": 65044618819200.0, "grad_norm": 0.7082331396063813, "language_loss": 0.5525611, "learning_rate": 3.130813663559735e-06, "loss": 0.57271874, "num_input_tokens_seen": 110343715, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.125, "step": 5136, "time_per_iteration": 3.045793056488037 }, { "auxiliary_loss_clip": 0.01078779, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.01882315, "balance_loss_mlp": 1.02354622, "epoch": 0.3088531489553585, "flos": 74736301173120.0, "grad_norm": 2.646118738623355, "language_loss": 0.7619133, "learning_rate": 3.130502010930087e-06, "loss": 0.78305262, "num_input_tokens_seen": 110368430, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5546875, "step": 5137, "time_per_iteration": 2.768284797668457 }, { "auxiliary_loss_clip": 0.01077417, "auxiliary_loss_mlp": 0.01029081, "balance_loss_clip": 1.01478219, "balance_loss_mlp": 1.02478099, "epoch": 0.30891327220802645, "flos": 21141235664640.0, "grad_norm": 1.9320321602761175, "language_loss": 0.79881501, "learning_rate": 3.1301903179557293e-06, "loss": 0.81988001, "num_input_tokens_seen": 110386735, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.52734375, "step": 5138, "time_per_iteration": 2.3786003589630127 }, { "auxiliary_loss_clip": 0.01078554, "auxiliary_loss_mlp": 0.01027531, "balance_loss_clip": 1.01010287, "balance_loss_mlp": 1.0240829, "epoch": 0.3089733954606944, "flos": 25663360982400.0, "grad_norm": 1.844342245454365, "language_loss": 0.81375891, "learning_rate": 3.1298785846477868e-06, "loss": 0.83481979, "num_input_tokens_seen": 110406820, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.546875, "step": 5139, "time_per_iteration": 2.419050693511963 }, { "auxiliary_loss_clip": 0.01083194, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.0163784, "balance_loss_mlp": 1.02489758, "epoch": 0.3090335187133624, "flos": 19426336859520.0, "grad_norm": 2.0333308420387244, "language_loss": 0.77431548, "learning_rate": 3.129566811017384e-06, "loss": 0.79549295, "num_input_tokens_seen": 110424225, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.58203125, "step": 5140, "time_per_iteration": 2.3688924312591553 }, { "auxiliary_loss_clip": 0.0107667, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.01797056, "balance_loss_mlp": 1.02471864, "epoch": 0.30909364196603034, "flos": 20010294126720.0, "grad_norm": 1.6614967973781085, "language_loss": 0.78510714, "learning_rate": 3.1292549970756476e-06, "loss": 0.80620718, "num_input_tokens_seen": 110443310, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.51953125, "step": 5141, "time_per_iteration": 2.3845062255859375 }, { "auxiliary_loss_clip": 0.01080276, "auxiliary_loss_mlp": 0.01030535, "balance_loss_clip": 1.01422119, "balance_loss_mlp": 1.02444828, "epoch": 0.3091537652186983, "flos": 19676699285760.0, "grad_norm": 2.058116365027092, "language_loss": 0.86921352, "learning_rate": 3.128943142833705e-06, "loss": 0.89032161, "num_input_tokens_seen": 110460215, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.55859375, "step": 5142, "time_per_iteration": 2.3750782012939453 }, { "auxiliary_loss_clip": 0.01079277, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.01983297, "balance_loss_mlp": 1.02361512, "epoch": 0.3092138884713663, "flos": 17019075415680.0, "grad_norm": 1.8896357536378163, "language_loss": 0.78801435, "learning_rate": 3.128631248302686e-06, "loss": 0.80916965, "num_input_tokens_seen": 110479385, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5546875, "step": 5143, "time_per_iteration": 2.369886636734009 }, { "auxiliary_loss_clip": 0.01077859, "auxiliary_loss_mlp": 0.01033591, "balance_loss_clip": 1.01674676, "balance_loss_mlp": 1.02326035, "epoch": 0.3092740117240343, "flos": 25008809212800.0, "grad_norm": 1.7947530757302053, "language_loss": 0.72192693, "learning_rate": 3.12831931349372e-06, "loss": 0.7430414, "num_input_tokens_seen": 110499885, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.546875, "step": 5144, "time_per_iteration": 2.4250173568725586 }, { "auxiliary_loss_clip": 0.01080226, "auxiliary_loss_mlp": 0.01035091, "balance_loss_clip": 1.01883709, "balance_loss_mlp": 1.02430296, "epoch": 0.30933413497670226, "flos": 25589310255360.0, "grad_norm": 5.496147825587017, "language_loss": 0.73840511, "learning_rate": 3.128007338417941e-06, "loss": 0.75955832, "num_input_tokens_seen": 110519690, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.55859375, "step": 5145, "time_per_iteration": 2.4251694679260254 }, { "auxiliary_loss_clip": 0.01080089, "auxiliary_loss_mlp": 0.01036067, "balance_loss_clip": 1.01983047, "balance_loss_mlp": 1.02473593, "epoch": 0.3093942582293702, "flos": 24388507353600.0, "grad_norm": 1.7758187332346287, "language_loss": 0.75967741, "learning_rate": 3.127695323086481e-06, "loss": 0.78083897, "num_input_tokens_seen": 110540520, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5546875, "step": 5146, "time_per_iteration": 2.4361259937286377 }, { "auxiliary_loss_clip": 0.01076616, "auxiliary_loss_mlp": 0.0102819, "balance_loss_clip": 1.01258004, "balance_loss_mlp": 1.02296495, "epoch": 0.3094543814820382, "flos": 19645416840960.0, "grad_norm": 1.8186994298447199, "language_loss": 0.66443276, "learning_rate": 3.1273832675104766e-06, "loss": 0.68548083, "num_input_tokens_seen": 110557950, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 5147, "time_per_iteration": 2.3907363414764404 }, { "auxiliary_loss_clip": 0.01081732, "auxiliary_loss_mlp": 0.01035762, "balance_loss_clip": 1.02046752, "balance_loss_mlp": 1.02510178, "epoch": 0.30951450473470615, "flos": 25662697666560.0, "grad_norm": 1.750283828070761, "language_loss": 0.74403429, "learning_rate": 3.1270711717010623e-06, "loss": 0.76520926, "num_input_tokens_seen": 110578215, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.56640625, "step": 5148, "time_per_iteration": 2.4220266342163086 }, { "auxiliary_loss_clip": 0.01084041, "auxiliary_loss_mlp": 0.01040437, "balance_loss_clip": 1.02317524, "balance_loss_mlp": 1.02491403, "epoch": 0.3095746279873741, "flos": 12019617722880.0, "grad_norm": 6.035270152315335, "language_loss": 0.72642064, "learning_rate": 3.126759035669378e-06, "loss": 0.74766546, "num_input_tokens_seen": 110592990, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.59375, "step": 5149, "time_per_iteration": 2.322463035583496 }, { "auxiliary_loss_clip": 0.01079395, "auxiliary_loss_mlp": 0.01039494, "balance_loss_clip": 1.02232814, "balance_loss_mlp": 1.02319908, "epoch": 0.3096347512400421, "flos": 23621919431040.0, "grad_norm": 1.7128054683454688, "language_loss": 0.84999681, "learning_rate": 3.1264468594265612e-06, "loss": 0.87118566, "num_input_tokens_seen": 110612130, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5625, "step": 5150, "time_per_iteration": 2.3952243328094482 }, { "auxiliary_loss_clip": 0.0107901, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.01827121, "balance_loss_mlp": 1.02312183, "epoch": 0.30969487449271005, "flos": 22528195269120.0, "grad_norm": 1.8247627657633652, "language_loss": 0.78886694, "learning_rate": 3.126134642983754e-06, "loss": 0.81001163, "num_input_tokens_seen": 110632045, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.55859375, "step": 5151, "time_per_iteration": 3.798525333404541 }, { "auxiliary_loss_clip": 0.01082234, "auxiliary_loss_mlp": 0.01036797, "balance_loss_clip": 1.02034593, "balance_loss_mlp": 1.02549887, "epoch": 0.309754997745378, "flos": 15267029057280.0, "grad_norm": 1.886276959709869, "language_loss": 0.66999632, "learning_rate": 3.125822386352098e-06, "loss": 0.69118667, "num_input_tokens_seen": 110649340, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.56640625, "step": 5152, "time_per_iteration": 2.3564295768737793 }, { "auxiliary_loss_clip": 0.01078455, "auxiliary_loss_mlp": 0.01030742, "balance_loss_clip": 1.01430285, "balance_loss_mlp": 1.02357638, "epoch": 0.309815120998046, "flos": 26978085250560.0, "grad_norm": 2.0609214740170243, "language_loss": 0.82165974, "learning_rate": 3.1255100895427373e-06, "loss": 0.84275174, "num_input_tokens_seen": 110668450, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 5153, "time_per_iteration": 2.414320468902588 }, { "auxiliary_loss_clip": 0.01081617, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.01829267, "balance_loss_mlp": 1.02683783, "epoch": 0.30987524425071394, "flos": 21142073537280.0, "grad_norm": 1.7577864193391401, "language_loss": 0.73961419, "learning_rate": 3.1251977525668167e-06, "loss": 0.76078254, "num_input_tokens_seen": 110689410, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.546875, "step": 5154, "time_per_iteration": 3.8006789684295654 }, { "auxiliary_loss_clip": 0.01080085, "auxiliary_loss_mlp": 0.01029341, "balance_loss_clip": 1.01333094, "balance_loss_mlp": 1.02546859, "epoch": 0.3099353675033819, "flos": 15267378170880.0, "grad_norm": 2.0899740514145404, "language_loss": 0.75891566, "learning_rate": 3.1248853754354824e-06, "loss": 0.78000993, "num_input_tokens_seen": 110707350, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5155, "time_per_iteration": 2.361358642578125 }, { "auxiliary_loss_clip": 0.01082867, "auxiliary_loss_mlp": 0.01032622, "balance_loss_clip": 1.01556349, "balance_loss_mlp": 1.02651596, "epoch": 0.30999549075604993, "flos": 15412896184320.0, "grad_norm": 1.7411115666002182, "language_loss": 0.78394848, "learning_rate": 3.1245729581598826e-06, "loss": 0.80510342, "num_input_tokens_seen": 110724910, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5625, "step": 5156, "time_per_iteration": 3.7453432083129883 }, { "auxiliary_loss_clip": 0.01081073, "auxiliary_loss_mlp": 0.01038307, "balance_loss_clip": 1.02029407, "balance_loss_mlp": 1.02402627, "epoch": 0.3100556140087179, "flos": 23183445265920.0, "grad_norm": 2.132798793292484, "language_loss": 0.75284863, "learning_rate": 3.1242605007511664e-06, "loss": 0.77404249, "num_input_tokens_seen": 110744010, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5703125, "step": 5157, "time_per_iteration": 2.4060237407684326 }, { "auxiliary_loss_clip": 0.01078916, "auxiliary_loss_mlp": 0.01032459, "balance_loss_clip": 1.01668811, "balance_loss_mlp": 1.02377439, "epoch": 0.31011573726138586, "flos": 25740902845440.0, "grad_norm": 1.5550964610521982, "language_loss": 0.69124174, "learning_rate": 3.1239480032204857e-06, "loss": 0.71235549, "num_input_tokens_seen": 110765835, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55078125, "step": 5158, "time_per_iteration": 2.426074504852295 }, { "auxiliary_loss_clip": 0.01076075, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.01689982, "balance_loss_mlp": 1.02291238, "epoch": 0.3101758605140538, "flos": 20010294126720.0, "grad_norm": 2.0654928431009396, "language_loss": 0.85362601, "learning_rate": 3.123635465578991e-06, "loss": 0.87471652, "num_input_tokens_seen": 110784655, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.53125, "step": 5159, "time_per_iteration": 2.3895745277404785 }, { "auxiliary_loss_clip": 0.01078274, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.01403642, "balance_loss_mlp": 1.02348506, "epoch": 0.3102359837667218, "flos": 19134672428160.0, "grad_norm": 2.5436504189724385, "language_loss": 0.84694105, "learning_rate": 3.123322887837837e-06, "loss": 0.86803138, "num_input_tokens_seen": 110802545, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.546875, "step": 5160, "time_per_iteration": 3.718170642852783 }, { "auxiliary_loss_clip": 0.01079781, "auxiliary_loss_mlp": 0.01033897, "balance_loss_clip": 1.01805365, "balance_loss_mlp": 1.02562118, "epoch": 0.31029610701938976, "flos": 22264531614720.0, "grad_norm": 4.096411625206555, "language_loss": 0.7556901, "learning_rate": 3.123010270008179e-06, "loss": 0.77682686, "num_input_tokens_seen": 110820265, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5390625, "step": 5161, "time_per_iteration": 2.3770389556884766 }, { "auxiliary_loss_clip": 0.01080668, "auxiliary_loss_mlp": 0.01036132, "balance_loss_clip": 1.01968145, "balance_loss_mlp": 1.02384794, "epoch": 0.3103562302720577, "flos": 20804533712640.0, "grad_norm": 2.2722921416374873, "language_loss": 0.81461251, "learning_rate": 3.1226976121011734e-06, "loss": 0.8357805, "num_input_tokens_seen": 110836195, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5703125, "step": 5162, "time_per_iteration": 2.3431122303009033 }, { "auxiliary_loss_clip": 0.01076793, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.0169183, "balance_loss_mlp": 1.02361786, "epoch": 0.3104163535247257, "flos": 22343120818560.0, "grad_norm": 1.6402388280776106, "language_loss": 0.82766771, "learning_rate": 3.1223849141279774e-06, "loss": 0.84874785, "num_input_tokens_seen": 110856420, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.53125, "step": 5163, "time_per_iteration": 2.387047529220581 }, { "auxiliary_loss_clip": 0.01081891, "auxiliary_loss_mlp": 0.01036125, "balance_loss_clip": 1.01897073, "balance_loss_mlp": 1.02526426, "epoch": 0.31047647677739365, "flos": 21688289758080.0, "grad_norm": 2.5654516636234286, "language_loss": 0.7619108, "learning_rate": 3.1220721760997517e-06, "loss": 0.78309095, "num_input_tokens_seen": 110876650, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.56640625, "step": 5164, "time_per_iteration": 2.3888611793518066 }, { "auxiliary_loss_clip": 0.01082237, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.01800752, "balance_loss_mlp": 1.02623677, "epoch": 0.3105366000300616, "flos": 18916255762560.0, "grad_norm": 2.100125460507144, "language_loss": 0.74655926, "learning_rate": 3.1217593980276554e-06, "loss": 0.76772702, "num_input_tokens_seen": 110894445, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5625, "step": 5165, "time_per_iteration": 2.375763177871704 }, { "auxiliary_loss_clip": 0.01077283, "auxiliary_loss_mlp": 0.01031538, "balance_loss_clip": 1.01592183, "balance_loss_mlp": 1.02380311, "epoch": 0.3105967232827296, "flos": 18259399843200.0, "grad_norm": 1.4971185350030323, "language_loss": 0.75919765, "learning_rate": 3.1214465799228525e-06, "loss": 0.78028589, "num_input_tokens_seen": 110912855, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 5166, "time_per_iteration": 2.363426685333252 }, { "auxiliary_loss_clip": 0.01079001, "auxiliary_loss_mlp": 0.01035679, "balance_loss_clip": 1.01888227, "balance_loss_mlp": 1.02397704, "epoch": 0.31065684653539755, "flos": 17671288124160.0, "grad_norm": 2.2446587201714228, "language_loss": 0.73668718, "learning_rate": 3.121133721796505e-06, "loss": 0.75783396, "num_input_tokens_seen": 110928025, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.55078125, "step": 5167, "time_per_iteration": 2.340365409851074 }, { "auxiliary_loss_clip": 0.01014494, "auxiliary_loss_mlp": 0.01002894, "balance_loss_clip": 1.00134408, "balance_loss_mlp": 1.00219703, "epoch": 0.3107169697880655, "flos": 68528742238080.0, "grad_norm": 0.7114608222550712, "language_loss": 0.52947611, "learning_rate": 3.1208208236597795e-06, "loss": 0.54965001, "num_input_tokens_seen": 110992215, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.12304688, "step": 5168, "time_per_iteration": 3.0846521854400635 }, { "auxiliary_loss_clip": 0.01081284, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.01458335, "balance_loss_mlp": 1.0247972, "epoch": 0.3107770930407335, "flos": 13187881370880.0, "grad_norm": 31.9167680073535, "language_loss": 0.786448, "learning_rate": 3.1205078855238417e-06, "loss": 0.80757707, "num_input_tokens_seen": 111010400, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5625, "step": 5169, "time_per_iteration": 2.398637056350708 }, { "auxiliary_loss_clip": 0.0107965, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.01420891, "balance_loss_mlp": 1.02494442, "epoch": 0.3108372162934015, "flos": 31579393265280.0, "grad_norm": 1.5191279813242082, "language_loss": 0.64471245, "learning_rate": 3.12019490739986e-06, "loss": 0.66580755, "num_input_tokens_seen": 111033960, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.546875, "step": 5170, "time_per_iteration": 2.4840941429138184 }, { "auxiliary_loss_clip": 0.01083673, "auxiliary_loss_mlp": 0.01035095, "balance_loss_clip": 1.01763117, "balance_loss_mlp": 1.02708578, "epoch": 0.31089733954606946, "flos": 28728595509120.0, "grad_norm": 3.0215369841663513, "language_loss": 0.77810049, "learning_rate": 3.1198818892990037e-06, "loss": 0.79928815, "num_input_tokens_seen": 111053265, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.56640625, "step": 5171, "time_per_iteration": 2.4291341304779053 }, { "auxiliary_loss_clip": 0.01082376, "auxiliary_loss_mlp": 0.01034013, "balance_loss_clip": 1.01727653, "balance_loss_mlp": 1.02589393, "epoch": 0.3109574627987374, "flos": 19682215280640.0, "grad_norm": 1.932997151229255, "language_loss": 0.83597481, "learning_rate": 3.1195688312324426e-06, "loss": 0.85713863, "num_input_tokens_seen": 111071130, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 5172, "time_per_iteration": 2.390674352645874 }, { "auxiliary_loss_clip": 0.01081184, "auxiliary_loss_mlp": 0.01036426, "balance_loss_clip": 1.01885414, "balance_loss_mlp": 1.02526832, "epoch": 0.3110175860514054, "flos": 14683106701440.0, "grad_norm": 2.085447486912353, "language_loss": 0.83641905, "learning_rate": 3.11925573321135e-06, "loss": 0.85759509, "num_input_tokens_seen": 111089560, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.55859375, "step": 5173, "time_per_iteration": 2.370365619659424 }, { "auxiliary_loss_clip": 0.01077795, "auxiliary_loss_mlp": 0.01030433, "balance_loss_clip": 1.01478052, "balance_loss_mlp": 1.0254494, "epoch": 0.31107770930407336, "flos": 25738459050240.0, "grad_norm": 2.2856778103464555, "language_loss": 0.83201587, "learning_rate": 3.1189425952469003e-06, "loss": 0.85309815, "num_input_tokens_seen": 111109960, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5234375, "step": 5174, "time_per_iteration": 2.4165408611297607 }, { "auxiliary_loss_clip": 0.01080246, "auxiliary_loss_mlp": 0.01032643, "balance_loss_clip": 1.01610303, "balance_loss_mlp": 1.02588403, "epoch": 0.3111378325567413, "flos": 19207256878080.0, "grad_norm": 2.5154940620519377, "language_loss": 0.85075682, "learning_rate": 3.1186294173502667e-06, "loss": 0.87188578, "num_input_tokens_seen": 111127960, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.54296875, "step": 5175, "time_per_iteration": 2.373990535736084 }, { "auxiliary_loss_clip": 0.01081859, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.02126956, "balance_loss_mlp": 1.02669513, "epoch": 0.3111979558094093, "flos": 23695237019520.0, "grad_norm": 1.582127669238255, "language_loss": 0.83314329, "learning_rate": 3.118316199532627e-06, "loss": 0.85433906, "num_input_tokens_seen": 111146730, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.55078125, "step": 5176, "time_per_iteration": 2.394024610519409 }, { "auxiliary_loss_clip": 0.01077379, "auxiliary_loss_mlp": 0.01031054, "balance_loss_clip": 1.01453185, "balance_loss_mlp": 1.02340221, "epoch": 0.31125807906207725, "flos": 21031957509120.0, "grad_norm": 2.0081556761543493, "language_loss": 0.80177754, "learning_rate": 3.1180029418051586e-06, "loss": 0.82286185, "num_input_tokens_seen": 111166295, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5390625, "step": 5177, "time_per_iteration": 2.380166530609131 }, { "auxiliary_loss_clip": 0.01080185, "auxiliary_loss_mlp": 0.01033765, "balance_loss_clip": 1.01688457, "balance_loss_mlp": 1.0258652, "epoch": 0.3113182023147452, "flos": 23075493742080.0, "grad_norm": 1.7207153332317755, "language_loss": 0.80549353, "learning_rate": 3.117689644179041e-06, "loss": 0.82663304, "num_input_tokens_seen": 111185665, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.54296875, "step": 5178, "time_per_iteration": 2.387977361679077 }, { "auxiliary_loss_clip": 0.01081592, "auxiliary_loss_mlp": 0.01033627, "balance_loss_clip": 1.01745057, "balance_loss_mlp": 1.02485847, "epoch": 0.3113783255674132, "flos": 11838174053760.0, "grad_norm": 1.7749807277252454, "language_loss": 0.81721008, "learning_rate": 3.1173763066654556e-06, "loss": 0.83836234, "num_input_tokens_seen": 111201615, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5703125, "step": 5179, "time_per_iteration": 2.371904134750366 }, { "auxiliary_loss_clip": 0.0108212, "auxiliary_loss_mlp": 0.01040558, "balance_loss_clip": 1.02355909, "balance_loss_mlp": 1.02723241, "epoch": 0.31143844882008115, "flos": 16288622616960.0, "grad_norm": 1.6678175791675147, "language_loss": 0.78229654, "learning_rate": 3.1170629292755837e-06, "loss": 0.80352336, "num_input_tokens_seen": 111220515, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.546875, "step": 5180, "time_per_iteration": 2.3581440448760986 }, { "auxiliary_loss_clip": 0.01079154, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.01297402, "balance_loss_mlp": 1.02463138, "epoch": 0.3114985720727491, "flos": 23216787480960.0, "grad_norm": 1.716053471694723, "language_loss": 0.8308934, "learning_rate": 3.1167495120206094e-06, "loss": 0.85197449, "num_input_tokens_seen": 111240395, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5181, "time_per_iteration": 2.4131224155426025 }, { "auxiliary_loss_clip": 0.01075548, "auxiliary_loss_mlp": 0.01032107, "balance_loss_clip": 1.01744437, "balance_loss_mlp": 1.02359533, "epoch": 0.3115586953254171, "flos": 30043319777280.0, "grad_norm": 3.410956960308437, "language_loss": 0.74611485, "learning_rate": 3.116436054911717e-06, "loss": 0.76719141, "num_input_tokens_seen": 111261100, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.515625, "step": 5182, "time_per_iteration": 2.4406213760375977 }, { "auxiliary_loss_clip": 0.01082012, "auxiliary_loss_mlp": 0.01044019, "balance_loss_clip": 1.02663863, "balance_loss_mlp": 1.02548504, "epoch": 0.3116188185780851, "flos": 25665141461760.0, "grad_norm": 1.827078533865724, "language_loss": 0.70743579, "learning_rate": 3.116122557960094e-06, "loss": 0.72869617, "num_input_tokens_seen": 111281320, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.56640625, "step": 5183, "time_per_iteration": 2.428618907928467 }, { "auxiliary_loss_clip": 0.0101554, "auxiliary_loss_mlp": 0.01005353, "balance_loss_clip": 1.00384498, "balance_loss_mlp": 1.0032835, "epoch": 0.31167894183075306, "flos": 69506974022400.0, "grad_norm": 1.1213421774254597, "language_loss": 0.59617829, "learning_rate": 3.115809021176928e-06, "loss": 0.61638725, "num_input_tokens_seen": 111341405, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.12255859, "step": 5184, "time_per_iteration": 3.0369694232940674 }, { "auxiliary_loss_clip": 0.01076911, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.01824319, "balance_loss_mlp": 1.02336717, "epoch": 0.31173906508342103, "flos": 14938950211200.0, "grad_norm": 2.0188060636368292, "language_loss": 0.7018702, "learning_rate": 3.1154954445734088e-06, "loss": 0.7229737, "num_input_tokens_seen": 111358975, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.53515625, "step": 5185, "time_per_iteration": 2.36911678314209 }, { "auxiliary_loss_clip": 0.01081712, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 1.01922858, "balance_loss_mlp": 1.02561331, "epoch": 0.311799188336089, "flos": 16175224920960.0, "grad_norm": 2.3242265680003027, "language_loss": 0.63079411, "learning_rate": 3.115181828160726e-06, "loss": 0.65195954, "num_input_tokens_seen": 111375845, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5625, "step": 5186, "time_per_iteration": 2.345111131668091 }, { "auxiliary_loss_clip": 0.01083555, "auxiliary_loss_mlp": 0.01038992, "balance_loss_clip": 1.02171898, "balance_loss_mlp": 1.02592337, "epoch": 0.31185931158875696, "flos": 25008460099200.0, "grad_norm": 2.375239376716589, "language_loss": 0.86922002, "learning_rate": 3.1148681719500723e-06, "loss": 0.89044547, "num_input_tokens_seen": 111394150, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.57421875, "step": 5187, "time_per_iteration": 2.4081404209136963 }, { "auxiliary_loss_clip": 0.01078339, "auxiliary_loss_mlp": 0.01036552, "balance_loss_clip": 1.02109027, "balance_loss_mlp": 1.02306581, "epoch": 0.3119194348414249, "flos": 37231377868800.0, "grad_norm": 1.5291252460434375, "language_loss": 0.62894654, "learning_rate": 3.114554475952642e-06, "loss": 0.65009546, "num_input_tokens_seen": 111418355, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5546875, "step": 5188, "time_per_iteration": 2.513761043548584 }, { "auxiliary_loss_clip": 0.01082265, "auxiliary_loss_mlp": 0.01035234, "balance_loss_clip": 1.01893842, "balance_loss_mlp": 1.027493, "epoch": 0.3119795580940929, "flos": 15011883774720.0, "grad_norm": 2.1436139170398505, "language_loss": 0.8322295, "learning_rate": 3.1142407401796283e-06, "loss": 0.8534044, "num_input_tokens_seen": 111435445, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.546875, "step": 5189, "time_per_iteration": 2.3524062633514404 }, { "auxiliary_loss_clip": 0.0107796, "auxiliary_loss_mlp": 0.0102794, "balance_loss_clip": 1.01183438, "balance_loss_mlp": 1.02331614, "epoch": 0.31203968134676086, "flos": 15997237476480.0, "grad_norm": 2.0552027135418753, "language_loss": 0.78954196, "learning_rate": 3.113926964642229e-06, "loss": 0.810601, "num_input_tokens_seen": 111453430, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.546875, "step": 5190, "time_per_iteration": 2.3690733909606934 }, { "auxiliary_loss_clip": 0.01081006, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.01239634, "balance_loss_mlp": 1.02578008, "epoch": 0.3120998045994288, "flos": 23836356201600.0, "grad_norm": 1.7876105766162835, "language_loss": 0.75349545, "learning_rate": 3.1136131493516426e-06, "loss": 0.77458954, "num_input_tokens_seen": 111475325, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55078125, "step": 5191, "time_per_iteration": 3.8508388996124268 }, { "auxiliary_loss_clip": 0.01014703, "auxiliary_loss_mlp": 0.01001387, "balance_loss_clip": 0.9999088, "balance_loss_mlp": 1.0020169, "epoch": 0.3121599278520968, "flos": 69181059680640.0, "grad_norm": 0.8485211279840035, "language_loss": 0.63893914, "learning_rate": 3.1132992943190664e-06, "loss": 0.65910006, "num_input_tokens_seen": 111533960, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.12695312, "step": 5192, "time_per_iteration": 3.085897207260132 }, { "auxiliary_loss_clip": 0.01079491, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.01410747, "balance_loss_mlp": 1.02389359, "epoch": 0.31222005110476475, "flos": 23805213402240.0, "grad_norm": 1.5507979407197972, "language_loss": 0.79747391, "learning_rate": 3.1129853995557033e-06, "loss": 0.81857741, "num_input_tokens_seen": 111554055, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5546875, "step": 5193, "time_per_iteration": 2.4205949306488037 }, { "auxiliary_loss_clip": 0.01080126, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.01493776, "balance_loss_mlp": 1.02426255, "epoch": 0.3122801743574327, "flos": 25225026462720.0, "grad_norm": 1.9433182780917384, "language_loss": 0.72491264, "learning_rate": 3.1126714650727534e-06, "loss": 0.74602962, "num_input_tokens_seen": 111574305, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55859375, "step": 5194, "time_per_iteration": 2.40321683883667 }, { "auxiliary_loss_clip": 0.01081824, "auxiliary_loss_mlp": 0.01031836, "balance_loss_clip": 1.015957, "balance_loss_mlp": 1.02660155, "epoch": 0.3123402976101007, "flos": 22965377713920.0, "grad_norm": 1.3980190269412873, "language_loss": 0.76536357, "learning_rate": 3.112357490881421e-06, "loss": 0.78650016, "num_input_tokens_seen": 111595680, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55078125, "step": 5195, "time_per_iteration": 3.8176259994506836 }, { "auxiliary_loss_clip": 0.01077942, "auxiliary_loss_mlp": 0.01039451, "balance_loss_clip": 1.02213025, "balance_loss_mlp": 1.02252913, "epoch": 0.3124004208627687, "flos": 25190916197760.0, "grad_norm": 1.4224160415367675, "language_loss": 0.77692068, "learning_rate": 3.112043476992911e-06, "loss": 0.79809463, "num_input_tokens_seen": 111618135, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5546875, "step": 5196, "time_per_iteration": 3.7637298107147217 }, { "auxiliary_loss_clip": 0.01013106, "auxiliary_loss_mlp": 0.01002574, "balance_loss_clip": 1.00095296, "balance_loss_mlp": 1.00082779, "epoch": 0.31246054411543667, "flos": 67481626608000.0, "grad_norm": 0.8141568095750946, "language_loss": 0.54867059, "learning_rate": 3.1117294234184304e-06, "loss": 0.56882739, "num_input_tokens_seen": 111682220, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.12304688, "step": 5197, "time_per_iteration": 3.0648508071899414 }, { "auxiliary_loss_clip": 0.01079666, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.01822639, "balance_loss_mlp": 1.02567363, "epoch": 0.31252066736810463, "flos": 17857549560960.0, "grad_norm": 1.5630285299325366, "language_loss": 0.66702014, "learning_rate": 3.111415330169186e-06, "loss": 0.6881544, "num_input_tokens_seen": 111700815, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5390625, "step": 5198, "time_per_iteration": 2.367238759994507 }, { "auxiliary_loss_clip": 0.01081461, "auxiliary_loss_mlp": 0.01029728, "balance_loss_clip": 1.01396847, "balance_loss_mlp": 1.02380633, "epoch": 0.3125807906207726, "flos": 18474150816000.0, "grad_norm": 1.9703807953105898, "language_loss": 0.69121277, "learning_rate": 3.111101197256387e-06, "loss": 0.71232462, "num_input_tokens_seen": 111718195, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.578125, "step": 5199, "time_per_iteration": 2.346186876296997 }, { "auxiliary_loss_clip": 0.01080707, "auxiliary_loss_mlp": 0.01037477, "balance_loss_clip": 1.02091897, "balance_loss_mlp": 1.0255847, "epoch": 0.31264091387344056, "flos": 18945722816640.0, "grad_norm": 1.7182015056405442, "language_loss": 0.78764206, "learning_rate": 3.110787024691245e-06, "loss": 0.80882394, "num_input_tokens_seen": 111734440, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55078125, "step": 5200, "time_per_iteration": 3.7270419597625732 }, { "auxiliary_loss_clip": 0.01078461, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.01316404, "balance_loss_mlp": 1.02528775, "epoch": 0.3127010371261085, "flos": 21467499120000.0, "grad_norm": 1.9979917626069408, "language_loss": 0.83699179, "learning_rate": 3.1104728124849714e-06, "loss": 0.85806423, "num_input_tokens_seen": 111751960, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 5201, "time_per_iteration": 2.3872454166412354 }, { "auxiliary_loss_clip": 0.01081885, "auxiliary_loss_mlp": 0.0103617, "balance_loss_clip": 1.01945066, "balance_loss_mlp": 1.02636862, "epoch": 0.3127611603787765, "flos": 15335284498560.0, "grad_norm": 2.0687619785515077, "language_loss": 0.6888448, "learning_rate": 3.110158560648779e-06, "loss": 0.71002531, "num_input_tokens_seen": 111769585, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5546875, "step": 5202, "time_per_iteration": 2.368793487548828 }, { "auxiliary_loss_clip": 0.01082165, "auxiliary_loss_mlp": 0.0103053, "balance_loss_clip": 1.0154382, "balance_loss_mlp": 1.02622008, "epoch": 0.31282128363144446, "flos": 17602020253440.0, "grad_norm": 1.9734822106395633, "language_loss": 0.8388415, "learning_rate": 3.109844269193884e-06, "loss": 0.85996842, "num_input_tokens_seen": 111787880, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.55859375, "step": 5203, "time_per_iteration": 2.370553731918335 }, { "auxiliary_loss_clip": 0.01079535, "auxiliary_loss_mlp": 0.01029369, "balance_loss_clip": 1.01358581, "balance_loss_mlp": 1.02512097, "epoch": 0.3128814068841124, "flos": 26755653778560.0, "grad_norm": 2.27855496136418, "language_loss": 0.60890663, "learning_rate": 3.109529938131501e-06, "loss": 0.62999564, "num_input_tokens_seen": 111805950, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.54296875, "step": 5204, "time_per_iteration": 2.4368503093719482 }, { "auxiliary_loss_clip": 0.01078105, "auxiliary_loss_mlp": 0.01028143, "balance_loss_clip": 1.01350415, "balance_loss_mlp": 1.02503884, "epoch": 0.3129415301367804, "flos": 22271304418560.0, "grad_norm": 1.7547907188661906, "language_loss": 0.65913028, "learning_rate": 3.109215567472849e-06, "loss": 0.68019271, "num_input_tokens_seen": 111826135, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.53125, "step": 5205, "time_per_iteration": 2.3928170204162598 }, { "auxiliary_loss_clip": 0.01080375, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.01767445, "balance_loss_mlp": 1.02590215, "epoch": 0.31300165338944835, "flos": 26463814790400.0, "grad_norm": 1.5164144421712036, "language_loss": 0.76598847, "learning_rate": 3.1089011572291464e-06, "loss": 0.78712916, "num_input_tokens_seen": 111844700, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5206, "time_per_iteration": 2.4176676273345947 }, { "auxiliary_loss_clip": 0.01078435, "auxiliary_loss_mlp": 0.01028937, "balance_loss_clip": 1.01295102, "balance_loss_mlp": 1.02383256, "epoch": 0.3130617766421163, "flos": 21943574686080.0, "grad_norm": 2.53045503060247, "language_loss": 0.8274411, "learning_rate": 3.1085867074116143e-06, "loss": 0.8485148, "num_input_tokens_seen": 111861585, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5207, "time_per_iteration": 2.390138626098633 }, { "auxiliary_loss_clip": 0.01078352, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.01528835, "balance_loss_mlp": 1.02652049, "epoch": 0.3131218998947843, "flos": 23291710992000.0, "grad_norm": 1.5481103415319293, "language_loss": 0.71460927, "learning_rate": 3.108272218031475e-06, "loss": 0.73569143, "num_input_tokens_seen": 111882950, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.51953125, "step": 5208, "time_per_iteration": 2.401580572128296 }, { "auxiliary_loss_clip": 0.01082266, "auxiliary_loss_mlp": 0.01036293, "balance_loss_clip": 1.01954424, "balance_loss_mlp": 1.02604055, "epoch": 0.3131820231474523, "flos": 21138652224000.0, "grad_norm": 1.7093167898150907, "language_loss": 0.74616063, "learning_rate": 3.1079576890999498e-06, "loss": 0.76734626, "num_input_tokens_seen": 111901640, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 5209, "time_per_iteration": 2.4222571849823 }, { "auxiliary_loss_clip": 0.01080924, "auxiliary_loss_mlp": 0.01033894, "balance_loss_clip": 1.01882601, "balance_loss_mlp": 1.02518535, "epoch": 0.31324214640012027, "flos": 23908870828800.0, "grad_norm": 1.6895503139107564, "language_loss": 0.77537382, "learning_rate": 3.107643120628265e-06, "loss": 0.79652196, "num_input_tokens_seen": 111919615, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5546875, "step": 5210, "time_per_iteration": 2.397287368774414 }, { "auxiliary_loss_clip": 0.01075108, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.01727307, "balance_loss_mlp": 1.02325869, "epoch": 0.31330226965278823, "flos": 22235832610560.0, "grad_norm": 1.8560947599935154, "language_loss": 0.79085064, "learning_rate": 3.1073285126276467e-06, "loss": 0.8119247, "num_input_tokens_seen": 111938485, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.51953125, "step": 5211, "time_per_iteration": 2.3730459213256836 }, { "auxiliary_loss_clip": 0.01077211, "auxiliary_loss_mlp": 0.01031763, "balance_loss_clip": 1.01728463, "balance_loss_mlp": 1.02407885, "epoch": 0.3133623929054562, "flos": 19753019251200.0, "grad_norm": 1.9107587778144368, "language_loss": 0.79605746, "learning_rate": 3.1070138651093217e-06, "loss": 0.81714725, "num_input_tokens_seen": 111956425, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.53125, "step": 5212, "time_per_iteration": 2.385580539703369 }, { "auxiliary_loss_clip": 0.01081485, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.02183723, "balance_loss_mlp": 1.02487481, "epoch": 0.31342251615812416, "flos": 27161030108160.0, "grad_norm": 2.5929896746124643, "language_loss": 0.712565, "learning_rate": 3.10669917808452e-06, "loss": 0.73376536, "num_input_tokens_seen": 111975915, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5625, "step": 5213, "time_per_iteration": 2.4182918071746826 }, { "auxiliary_loss_clip": 0.0108239, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.01552844, "balance_loss_mlp": 1.02738667, "epoch": 0.31348263941079213, "flos": 20228780615040.0, "grad_norm": 24.25431217186947, "language_loss": 0.77585387, "learning_rate": 3.106384451564471e-06, "loss": 0.79699421, "num_input_tokens_seen": 111995055, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.55078125, "step": 5214, "time_per_iteration": 2.3801310062408447 }, { "auxiliary_loss_clip": 0.01075621, "auxiliary_loss_mlp": 0.010259, "balance_loss_clip": 1.01164854, "balance_loss_mlp": 1.02380764, "epoch": 0.3135427626634601, "flos": 24606505082880.0, "grad_norm": 1.6536746125233315, "language_loss": 0.82472187, "learning_rate": 3.106069685560407e-06, "loss": 0.8457371, "num_input_tokens_seen": 112015830, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.515625, "step": 5215, "time_per_iteration": 2.410188674926758 }, { "auxiliary_loss_clip": 0.01079952, "auxiliary_loss_mlp": 0.01031824, "balance_loss_clip": 1.01605809, "balance_loss_mlp": 1.02497745, "epoch": 0.31360288591612806, "flos": 20958814477440.0, "grad_norm": 1.8239087292961935, "language_loss": 0.7913394, "learning_rate": 3.1057548800835613e-06, "loss": 0.81245714, "num_input_tokens_seen": 112035065, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.546875, "step": 5216, "time_per_iteration": 2.3760178089141846 }, { "auxiliary_loss_clip": 0.01077709, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.01504672, "balance_loss_mlp": 1.02292299, "epoch": 0.313663009168796, "flos": 26979272236800.0, "grad_norm": 1.6802899189431002, "language_loss": 0.68560529, "learning_rate": 3.105440035145168e-06, "loss": 0.70669973, "num_input_tokens_seen": 112058405, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.546875, "step": 5217, "time_per_iteration": 2.4405322074890137 }, { "auxiliary_loss_clip": 0.01081398, "auxiliary_loss_mlp": 0.01033088, "balance_loss_clip": 1.01719773, "balance_loss_mlp": 1.02496004, "epoch": 0.313723132421464, "flos": 18039935836800.0, "grad_norm": 1.585974064048292, "language_loss": 0.81060404, "learning_rate": 3.105125150756463e-06, "loss": 0.8317489, "num_input_tokens_seen": 112076420, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5625, "step": 5218, "time_per_iteration": 2.403529167175293 }, { "auxiliary_loss_clip": 0.01081012, "auxiliary_loss_mlp": 0.0103759, "balance_loss_clip": 1.02019131, "balance_loss_mlp": 1.02569699, "epoch": 0.31378325567413196, "flos": 22487905693440.0, "grad_norm": 3.4544843904123232, "language_loss": 0.69337761, "learning_rate": 3.1048102269286843e-06, "loss": 0.71456367, "num_input_tokens_seen": 112090775, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5546875, "step": 5219, "time_per_iteration": 2.359384059906006 }, { "auxiliary_loss_clip": 0.01078975, "auxiliary_loss_mlp": 0.01032719, "balance_loss_clip": 1.01668501, "balance_loss_mlp": 1.02443516, "epoch": 0.3138433789267999, "flos": 22418149063680.0, "grad_norm": 2.361456868397611, "language_loss": 0.79714119, "learning_rate": 3.104495263673071e-06, "loss": 0.81825817, "num_input_tokens_seen": 112110980, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5220, "time_per_iteration": 2.389528751373291 }, { "auxiliary_loss_clip": 0.01079666, "auxiliary_loss_mlp": 0.01028899, "balance_loss_clip": 1.0140636, "balance_loss_mlp": 1.02482939, "epoch": 0.3139035021794679, "flos": 13005076158720.0, "grad_norm": 1.7582443202352755, "language_loss": 0.73346162, "learning_rate": 3.1041802610008624e-06, "loss": 0.75454724, "num_input_tokens_seen": 112129020, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.546875, "step": 5221, "time_per_iteration": 2.342547655105591 }, { "auxiliary_loss_clip": 0.01076388, "auxiliary_loss_mlp": 0.01032688, "balance_loss_clip": 1.01732218, "balance_loss_mlp": 1.02358842, "epoch": 0.31396362543213585, "flos": 16945059600000.0, "grad_norm": 1.7344539838687305, "language_loss": 0.81519318, "learning_rate": 3.103865218923301e-06, "loss": 0.83628392, "num_input_tokens_seen": 112147865, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.52734375, "step": 5222, "time_per_iteration": 2.368912696838379 }, { "auxiliary_loss_clip": 0.01079858, "auxiliary_loss_mlp": 0.01036266, "balance_loss_clip": 1.01858759, "balance_loss_mlp": 1.0240047, "epoch": 0.31402374868480387, "flos": 20155707406080.0, "grad_norm": 2.371335870930761, "language_loss": 0.69601059, "learning_rate": 3.103550137451629e-06, "loss": 0.71717179, "num_input_tokens_seen": 112166745, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.55859375, "step": 5223, "time_per_iteration": 2.3521111011505127 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.01031703, "balance_loss_clip": 1.01721931, "balance_loss_mlp": 1.02363098, "epoch": 0.31408387193747184, "flos": 21250025061120.0, "grad_norm": 1.509765308651534, "language_loss": 0.80232835, "learning_rate": 3.1032350165970915e-06, "loss": 0.82341254, "num_input_tokens_seen": 112185895, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.53125, "step": 5224, "time_per_iteration": 2.3955767154693604 }, { "auxiliary_loss_clip": 0.01084675, "auxiliary_loss_mlp": 0.01035959, "balance_loss_clip": 1.01818466, "balance_loss_mlp": 1.02635026, "epoch": 0.3141439951901398, "flos": 27483208934400.0, "grad_norm": 2.628676699413002, "language_loss": 0.58446127, "learning_rate": 3.102919856370934e-06, "loss": 0.60566765, "num_input_tokens_seen": 112204465, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.58203125, "step": 5225, "time_per_iteration": 2.4336049556732178 }, { "auxiliary_loss_clip": 0.01073478, "auxiliary_loss_mlp": 0.01026267, "balance_loss_clip": 1.01224792, "balance_loss_mlp": 1.02312064, "epoch": 0.31420411844280777, "flos": 17851440072960.0, "grad_norm": 2.0848753295686944, "language_loss": 0.81684405, "learning_rate": 3.102604656784404e-06, "loss": 0.83784151, "num_input_tokens_seen": 112221635, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.50390625, "step": 5226, "time_per_iteration": 2.371347188949585 }, { "auxiliary_loss_clip": 0.01080596, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.01756275, "balance_loss_mlp": 1.025051, "epoch": 0.31426424169547573, "flos": 21615879864960.0, "grad_norm": 1.750586808766826, "language_loss": 0.74049574, "learning_rate": 3.10228941784875e-06, "loss": 0.76164633, "num_input_tokens_seen": 112241240, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5546875, "step": 5227, "time_per_iteration": 2.3996050357818604 }, { "auxiliary_loss_clip": 0.01081925, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.01865268, "balance_loss_mlp": 1.0257802, "epoch": 0.3143243649481437, "flos": 30919290589440.0, "grad_norm": 1.8413185551654894, "language_loss": 0.6761961, "learning_rate": 3.101974139575222e-06, "loss": 0.69736111, "num_input_tokens_seen": 112262350, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5625, "step": 5228, "time_per_iteration": 2.4649455547332764 }, { "auxiliary_loss_clip": 0.01078865, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.01746917, "balance_loss_mlp": 1.02348232, "epoch": 0.31438448820081166, "flos": 22820278636800.0, "grad_norm": 1.7984687215982043, "language_loss": 0.79878032, "learning_rate": 3.1016588219750716e-06, "loss": 0.81989688, "num_input_tokens_seen": 112283710, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5546875, "step": 5229, "time_per_iteration": 2.399991750717163 }, { "auxiliary_loss_clip": 0.01080643, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 1.01443267, "balance_loss_mlp": 1.02655613, "epoch": 0.3144446114534796, "flos": 23291082587520.0, "grad_norm": 1.8454262604987328, "language_loss": 0.69932103, "learning_rate": 3.1013434650595522e-06, "loss": 0.7204352, "num_input_tokens_seen": 112304285, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5390625, "step": 5230, "time_per_iteration": 2.433551788330078 }, { "auxiliary_loss_clip": 0.0107981, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.01765549, "balance_loss_mlp": 1.02458167, "epoch": 0.3145047347061476, "flos": 31354692554880.0, "grad_norm": 1.582479321456293, "language_loss": 0.79186082, "learning_rate": 3.101028068839917e-06, "loss": 0.81300199, "num_input_tokens_seen": 112325110, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5546875, "step": 5231, "time_per_iteration": 3.946516752243042 }, { "auxiliary_loss_clip": 0.0107863, "auxiliary_loss_mlp": 0.01037219, "balance_loss_clip": 1.02024388, "balance_loss_mlp": 1.02544665, "epoch": 0.31456485795881556, "flos": 10888780919040.0, "grad_norm": 2.0289127983457225, "language_loss": 0.84659767, "learning_rate": 3.100712633327422e-06, "loss": 0.86775613, "num_input_tokens_seen": 112339855, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.53125, "step": 5232, "time_per_iteration": 2.365459442138672 }, { "auxiliary_loss_clip": 0.01079561, "auxiliary_loss_mlp": 0.01035737, "balance_loss_clip": 1.01959586, "balance_loss_mlp": 1.02578616, "epoch": 0.3146249812114835, "flos": 17091485308800.0, "grad_norm": 1.5416621934797834, "language_loss": 0.79673326, "learning_rate": 3.100397158533325e-06, "loss": 0.81788617, "num_input_tokens_seen": 112358480, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5390625, "step": 5233, "time_per_iteration": 2.371882677078247 }, { "auxiliary_loss_clip": 0.01079983, "auxiliary_loss_mlp": 0.01032709, "balance_loss_clip": 1.01687813, "balance_loss_mlp": 1.02624929, "epoch": 0.3146851044641515, "flos": 55289469479040.0, "grad_norm": 1.6638065143373353, "language_loss": 0.7100246, "learning_rate": 3.100081644468883e-06, "loss": 0.73115146, "num_input_tokens_seen": 112382350, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53515625, "step": 5234, "time_per_iteration": 4.136966228485107 }, { "auxiliary_loss_clip": 0.01080921, "auxiliary_loss_mlp": 0.01028831, "balance_loss_clip": 1.01270175, "balance_loss_mlp": 1.02584028, "epoch": 0.31474522771681945, "flos": 27014674222080.0, "grad_norm": 2.2056654656185146, "language_loss": 0.72418338, "learning_rate": 3.0997660911453575e-06, "loss": 0.74528086, "num_input_tokens_seen": 112400260, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.55078125, "step": 5235, "time_per_iteration": 3.772786855697632 }, { "auxiliary_loss_clip": 0.0107908, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.01344013, "balance_loss_mlp": 1.02459812, "epoch": 0.3148053509694875, "flos": 21250862933760.0, "grad_norm": 1.8033134939001587, "language_loss": 0.78739047, "learning_rate": 3.0994504985740096e-06, "loss": 0.8084721, "num_input_tokens_seen": 112419400, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.54296875, "step": 5236, "time_per_iteration": 2.3732361793518066 }, { "auxiliary_loss_clip": 0.01081067, "auxiliary_loss_mlp": 0.01034409, "balance_loss_clip": 1.01696825, "balance_loss_mlp": 1.02453232, "epoch": 0.31486547422215544, "flos": 31247334524160.0, "grad_norm": 1.6319074967202862, "language_loss": 0.75643516, "learning_rate": 3.099134866766101e-06, "loss": 0.77758992, "num_input_tokens_seen": 112440825, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.56640625, "step": 5237, "time_per_iteration": 2.448544979095459 }, { "auxiliary_loss_clip": 0.01075025, "auxiliary_loss_mlp": 0.01034988, "balance_loss_clip": 1.02017581, "balance_loss_mlp": 1.02330637, "epoch": 0.3149255974748234, "flos": 19827593648640.0, "grad_norm": 2.0433828735521224, "language_loss": 0.79406428, "learning_rate": 3.0988191957328967e-06, "loss": 0.81516439, "num_input_tokens_seen": 112459180, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.51953125, "step": 5238, "time_per_iteration": 2.36114764213562 }, { "auxiliary_loss_clip": 0.0101433, "auxiliary_loss_mlp": 0.01001969, "balance_loss_clip": 1.00028789, "balance_loss_mlp": 1.00214195, "epoch": 0.31498572072749137, "flos": 67680981671040.0, "grad_norm": 0.9531834974421111, "language_loss": 0.67927349, "learning_rate": 3.0985034854856615e-06, "loss": 0.69943643, "num_input_tokens_seen": 112516680, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.12207031, "step": 5239, "time_per_iteration": 4.398332834243774 }, { "auxiliary_loss_clip": 0.01082341, "auxiliary_loss_mlp": 0.01038409, "balance_loss_clip": 1.02045596, "balance_loss_mlp": 1.02554214, "epoch": 0.31504584398015933, "flos": 19792086929280.0, "grad_norm": 2.0690547521273865, "language_loss": 0.82568109, "learning_rate": 3.098187736035663e-06, "loss": 0.84688854, "num_input_tokens_seen": 112535895, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.56640625, "step": 5240, "time_per_iteration": 2.376136064529419 }, { "auxiliary_loss_clip": 0.01081439, "auxiliary_loss_mlp": 0.01032589, "balance_loss_clip": 1.01651359, "balance_loss_mlp": 1.02748108, "epoch": 0.3151059672328273, "flos": 26614185482880.0, "grad_norm": 1.6884027060147966, "language_loss": 0.81342447, "learning_rate": 3.097871947394168e-06, "loss": 0.83456481, "num_input_tokens_seen": 112557490, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5241, "time_per_iteration": 2.44288969039917 }, { "auxiliary_loss_clip": 0.01079048, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.0208745, "balance_loss_mlp": 1.02578354, "epoch": 0.31516609048549526, "flos": 24203363080320.0, "grad_norm": 1.7535267418641647, "language_loss": 0.73701584, "learning_rate": 3.0975561195724477e-06, "loss": 0.7581718, "num_input_tokens_seen": 112577075, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 5242, "time_per_iteration": 2.4050450325012207 }, { "auxiliary_loss_clip": 0.01079953, "auxiliary_loss_mlp": 0.0103126, "balance_loss_clip": 1.01420116, "balance_loss_mlp": 1.02561021, "epoch": 0.31522621373816323, "flos": 25957504120320.0, "grad_norm": 1.877418219253772, "language_loss": 0.7359069, "learning_rate": 3.0972402525817732e-06, "loss": 0.75701892, "num_input_tokens_seen": 112597620, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.54296875, "step": 5243, "time_per_iteration": 2.4118287563323975 }, { "auxiliary_loss_clip": 0.01078961, "auxiliary_loss_mlp": 0.01033932, "balance_loss_clip": 1.01637232, "balance_loss_mlp": 1.02380478, "epoch": 0.3152863369908312, "flos": 21907718853120.0, "grad_norm": 1.7487849442454195, "language_loss": 0.64450109, "learning_rate": 3.0969243464334166e-06, "loss": 0.66562998, "num_input_tokens_seen": 112617150, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.55078125, "step": 5244, "time_per_iteration": 2.405380964279175 }, { "auxiliary_loss_clip": 0.01084078, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.0166012, "balance_loss_mlp": 1.02682877, "epoch": 0.31534646024349916, "flos": 16280383536000.0, "grad_norm": 1.9654765948469832, "language_loss": 0.91164446, "learning_rate": 3.0966084011386517e-06, "loss": 0.93281496, "num_input_tokens_seen": 112631090, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.57421875, "step": 5245, "time_per_iteration": 2.3264918327331543 }, { "auxiliary_loss_clip": 0.01081491, "auxiliary_loss_mlp": 0.01041307, "balance_loss_clip": 1.02439117, "balance_loss_mlp": 1.02488089, "epoch": 0.3154065834961671, "flos": 24716097440640.0, "grad_norm": 1.9188636766392912, "language_loss": 0.75167406, "learning_rate": 3.0962924167087526e-06, "loss": 0.77290201, "num_input_tokens_seen": 112651220, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5703125, "step": 5246, "time_per_iteration": 2.4234580993652344 }, { "auxiliary_loss_clip": 0.01079419, "auxiliary_loss_mlp": 0.01031931, "balance_loss_clip": 1.01505113, "balance_loss_mlp": 1.0246681, "epoch": 0.3154667067488351, "flos": 35369704241280.0, "grad_norm": 1.5425169810359798, "language_loss": 0.61345798, "learning_rate": 3.0959763931549985e-06, "loss": 0.63457149, "num_input_tokens_seen": 112671560, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.546875, "step": 5247, "time_per_iteration": 2.493908166885376 }, { "auxiliary_loss_clip": 0.01080289, "auxiliary_loss_mlp": 0.01032763, "balance_loss_clip": 1.0157752, "balance_loss_mlp": 1.02422643, "epoch": 0.31552683000150306, "flos": 17455524721920.0, "grad_norm": 2.4683848857149053, "language_loss": 0.82290494, "learning_rate": 3.0956603304886653e-06, "loss": 0.84403551, "num_input_tokens_seen": 112689790, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.55859375, "step": 5248, "time_per_iteration": 2.381251096725464 }, { "auxiliary_loss_clip": 0.01078831, "auxiliary_loss_mlp": 0.01045554, "balance_loss_clip": 1.02718377, "balance_loss_mlp": 1.02368248, "epoch": 0.3155869532541711, "flos": 18404778211200.0, "grad_norm": 1.847536235910411, "language_loss": 0.84778982, "learning_rate": 3.095344228721034e-06, "loss": 0.86903369, "num_input_tokens_seen": 112708265, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.55078125, "step": 5249, "time_per_iteration": 2.3564109802246094 }, { "auxiliary_loss_clip": 0.0108296, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.01699996, "balance_loss_mlp": 1.02666736, "epoch": 0.31564707650683904, "flos": 21578697400320.0, "grad_norm": 2.1545387065410253, "language_loss": 0.85248566, "learning_rate": 3.0950280878633844e-06, "loss": 0.87365431, "num_input_tokens_seen": 112727820, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5625, "step": 5250, "time_per_iteration": 2.385845422744751 }, { "auxiliary_loss_clip": 0.01079234, "auxiliary_loss_mlp": 0.01036326, "balance_loss_clip": 1.01895714, "balance_loss_mlp": 1.0229063, "epoch": 0.315707199759507, "flos": 21029967561600.0, "grad_norm": 2.386011396962339, "language_loss": 0.68639684, "learning_rate": 3.094711907926999e-06, "loss": 0.70755243, "num_input_tokens_seen": 112743140, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 5251, "time_per_iteration": 2.3697569370269775 }, { "auxiliary_loss_clip": 0.01079366, "auxiliary_loss_mlp": 0.01033545, "balance_loss_clip": 1.01743937, "balance_loss_mlp": 1.02507663, "epoch": 0.31576732301217497, "flos": 26827784380800.0, "grad_norm": 2.075415075517344, "language_loss": 0.79322481, "learning_rate": 3.0943956889231613e-06, "loss": 0.81435394, "num_input_tokens_seen": 112764705, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.54296875, "step": 5252, "time_per_iteration": 2.428051233291626 }, { "auxiliary_loss_clip": 0.01079094, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.01830542, "balance_loss_mlp": 1.02625227, "epoch": 0.31582744626484294, "flos": 22710057874560.0, "grad_norm": 1.6049140984299957, "language_loss": 0.7427122, "learning_rate": 3.0940794308631574e-06, "loss": 0.76384562, "num_input_tokens_seen": 112785310, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.52734375, "step": 5253, "time_per_iteration": 2.412838935852051 }, { "auxiliary_loss_clip": 0.01079843, "auxiliary_loss_mlp": 0.0103098, "balance_loss_clip": 1.01457703, "balance_loss_mlp": 1.02441907, "epoch": 0.3158875695175109, "flos": 23950766327040.0, "grad_norm": 1.6830502535164042, "language_loss": 0.7342571, "learning_rate": 3.0937631337582723e-06, "loss": 0.75536537, "num_input_tokens_seen": 112802905, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5546875, "step": 5254, "time_per_iteration": 2.3974599838256836 }, { "auxiliary_loss_clip": 0.01079906, "auxiliary_loss_mlp": 0.01033564, "balance_loss_clip": 1.01619577, "balance_loss_mlp": 1.02538848, "epoch": 0.31594769277017887, "flos": 13261024402560.0, "grad_norm": 1.8619085734771856, "language_loss": 0.77918929, "learning_rate": 3.093446797619795e-06, "loss": 0.80032402, "num_input_tokens_seen": 112820305, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.546875, "step": 5255, "time_per_iteration": 2.3649168014526367 }, { "auxiliary_loss_clip": 0.01076779, "auxiliary_loss_mlp": 0.01033573, "balance_loss_clip": 1.01565611, "balance_loss_mlp": 1.02357996, "epoch": 0.31600781602284683, "flos": 23367123262080.0, "grad_norm": 1.925964166170941, "language_loss": 0.7776494, "learning_rate": 3.093130422459013e-06, "loss": 0.7987529, "num_input_tokens_seen": 112841185, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.53125, "step": 5256, "time_per_iteration": 2.382408380508423 }, { "auxiliary_loss_clip": 0.01074519, "auxiliary_loss_mlp": 0.01037775, "balance_loss_clip": 1.02236092, "balance_loss_mlp": 1.02439499, "epoch": 0.3160679392755148, "flos": 19827558737280.0, "grad_norm": 1.5606482545775244, "language_loss": 0.71487117, "learning_rate": 3.0928140082872194e-06, "loss": 0.7359941, "num_input_tokens_seen": 112860570, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5, "step": 5257, "time_per_iteration": 2.375041961669922 }, { "auxiliary_loss_clip": 0.01078972, "auxiliary_loss_mlp": 0.01032206, "balance_loss_clip": 1.01505232, "balance_loss_mlp": 1.02506304, "epoch": 0.31612806252818276, "flos": 20192191643520.0, "grad_norm": 2.119419912510088, "language_loss": 0.7674104, "learning_rate": 3.092497555115704e-06, "loss": 0.78852212, "num_input_tokens_seen": 112877975, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5390625, "step": 5258, "time_per_iteration": 2.3770792484283447 }, { "auxiliary_loss_clip": 0.01081201, "auxiliary_loss_mlp": 0.01038012, "balance_loss_clip": 1.0215013, "balance_loss_mlp": 1.02563679, "epoch": 0.31618818578085073, "flos": 24235029550080.0, "grad_norm": 3.273718426598076, "language_loss": 0.72169727, "learning_rate": 3.0921810629557614e-06, "loss": 0.7428894, "num_input_tokens_seen": 112896170, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5546875, "step": 5259, "time_per_iteration": 2.395052194595337 }, { "auxiliary_loss_clip": 0.01083789, "auxiliary_loss_mlp": 0.01037654, "balance_loss_clip": 1.0212028, "balance_loss_mlp": 1.02691913, "epoch": 0.3162483090335187, "flos": 25080695435520.0, "grad_norm": 2.5263848532324014, "language_loss": 0.66497993, "learning_rate": 3.0918645318186863e-06, "loss": 0.6861943, "num_input_tokens_seen": 112916180, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.56640625, "step": 5260, "time_per_iteration": 2.4080142974853516 }, { "auxiliary_loss_clip": 0.01078806, "auxiliary_loss_mlp": 0.01030869, "balance_loss_clip": 1.01394165, "balance_loss_mlp": 1.02332568, "epoch": 0.31630843228618666, "flos": 26322171937920.0, "grad_norm": 2.5167465326335106, "language_loss": 0.72138435, "learning_rate": 3.091547961715775e-06, "loss": 0.74248111, "num_input_tokens_seen": 112936745, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5546875, "step": 5261, "time_per_iteration": 2.4369382858276367 }, { "auxiliary_loss_clip": 0.01015332, "auxiliary_loss_mlp": 0.01006692, "balance_loss_clip": 1.00507116, "balance_loss_mlp": 1.00298214, "epoch": 0.3163685555388547, "flos": 66754839502080.0, "grad_norm": 0.7596495394119944, "language_loss": 0.50580609, "learning_rate": 3.0912313526583237e-06, "loss": 0.52602631, "num_input_tokens_seen": 112994845, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.12353516, "step": 5262, "time_per_iteration": 3.0339443683624268 }, { "auxiliary_loss_clip": 0.01082344, "auxiliary_loss_mlp": 0.01031569, "balance_loss_clip": 1.01476026, "balance_loss_mlp": 1.02562666, "epoch": 0.31642867879152264, "flos": 25994442205440.0, "grad_norm": 1.4456440908159507, "language_loss": 0.85142934, "learning_rate": 3.0909147046576333e-06, "loss": 0.87256849, "num_input_tokens_seen": 113015125, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.56640625, "step": 5263, "time_per_iteration": 2.4157674312591553 }, { "auxiliary_loss_clip": 0.01076742, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.01944113, "balance_loss_mlp": 1.02456498, "epoch": 0.3164888020441906, "flos": 25773791212800.0, "grad_norm": 1.9295556132749418, "language_loss": 0.82008076, "learning_rate": 3.0905980177250026e-06, "loss": 0.84119469, "num_input_tokens_seen": 113035535, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5234375, "step": 5264, "time_per_iteration": 2.426569700241089 }, { "auxiliary_loss_clip": 0.01084145, "auxiliary_loss_mlp": 0.01033681, "balance_loss_clip": 1.01701522, "balance_loss_mlp": 1.02686477, "epoch": 0.3165489252968586, "flos": 19755183755520.0, "grad_norm": 1.8006424878874159, "language_loss": 0.79693788, "learning_rate": 3.090281291871734e-06, "loss": 0.81811619, "num_input_tokens_seen": 113052720, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5703125, "step": 5265, "time_per_iteration": 2.392524480819702 }, { "auxiliary_loss_clip": 0.01082602, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.01547682, "balance_loss_mlp": 1.02509403, "epoch": 0.31660904854952654, "flos": 23182851772800.0, "grad_norm": 1.5148958739692417, "language_loss": 0.74676967, "learning_rate": 3.089964527109131e-06, "loss": 0.76793444, "num_input_tokens_seen": 113071435, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.578125, "step": 5266, "time_per_iteration": 2.407978057861328 }, { "auxiliary_loss_clip": 0.01079704, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 1.01959407, "balance_loss_mlp": 1.02397108, "epoch": 0.3166691718021945, "flos": 20407571020800.0, "grad_norm": 1.996167646815778, "language_loss": 0.79371524, "learning_rate": 3.0896477234484976e-06, "loss": 0.81487888, "num_input_tokens_seen": 113088645, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.55859375, "step": 5267, "time_per_iteration": 2.361037015914917 }, { "auxiliary_loss_clip": 0.01013807, "auxiliary_loss_mlp": 0.01000798, "balance_loss_clip": 0.99931991, "balance_loss_mlp": 1.00158882, "epoch": 0.31672929505486247, "flos": 70141275336960.0, "grad_norm": 0.729997743554358, "language_loss": 0.5780766, "learning_rate": 3.08933088090114e-06, "loss": 0.59822267, "num_input_tokens_seen": 113152775, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.12207031, "step": 5268, "time_per_iteration": 3.0364322662353516 }, { "auxiliary_loss_clip": 0.01080023, "auxiliary_loss_mlp": 0.01030453, "balance_loss_clip": 1.01247668, "balance_loss_mlp": 1.02402115, "epoch": 0.31678941830753043, "flos": 14354888209920.0, "grad_norm": 2.54358297012882, "language_loss": 0.73169315, "learning_rate": 3.0890139994783653e-06, "loss": 0.75279784, "num_input_tokens_seen": 113171410, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5625, "step": 5269, "time_per_iteration": 2.376150131225586 }, { "auxiliary_loss_clip": 0.01080085, "auxiliary_loss_mlp": 0.01040108, "balance_loss_clip": 1.02297807, "balance_loss_mlp": 1.02438664, "epoch": 0.3168495415601984, "flos": 22746611934720.0, "grad_norm": 1.8357253850565918, "language_loss": 0.79865623, "learning_rate": 3.0886970791914822e-06, "loss": 0.81985819, "num_input_tokens_seen": 113189965, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.55859375, "step": 5270, "time_per_iteration": 3.775099515914917 }, { "auxiliary_loss_clip": 0.01083041, "auxiliary_loss_mlp": 0.01042041, "balance_loss_clip": 1.02405214, "balance_loss_mlp": 1.02541685, "epoch": 0.31690966481286637, "flos": 20114370489600.0, "grad_norm": 2.166272483725908, "language_loss": 0.79255253, "learning_rate": 3.088380120051801e-06, "loss": 0.81380343, "num_input_tokens_seen": 113206355, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.578125, "step": 5271, "time_per_iteration": 2.378732442855835 }, { "auxiliary_loss_clip": 0.01081302, "auxiliary_loss_mlp": 0.01030256, "balance_loss_clip": 1.0131495, "balance_loss_mlp": 1.02506793, "epoch": 0.31696978806553433, "flos": 21177859547520.0, "grad_norm": 1.7622399638827526, "language_loss": 0.73040378, "learning_rate": 3.088063122070633e-06, "loss": 0.75151944, "num_input_tokens_seen": 113225440, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.5625, "step": 5272, "time_per_iteration": 2.3977181911468506 }, { "auxiliary_loss_clip": 0.01082461, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.01118922, "balance_loss_mlp": 1.02434623, "epoch": 0.3170299113182023, "flos": 42995363713920.0, "grad_norm": 2.223613280288384, "language_loss": 0.69642627, "learning_rate": 3.0877460852592902e-06, "loss": 0.71755052, "num_input_tokens_seen": 113248840, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.58203125, "step": 5273, "time_per_iteration": 3.9881303310394287 }, { "auxiliary_loss_clip": 0.0107984, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.0169158, "balance_loss_mlp": 1.02494097, "epoch": 0.31709003457087026, "flos": 24459066944640.0, "grad_norm": 1.6830475698094653, "language_loss": 0.67651677, "learning_rate": 3.0874290096290888e-06, "loss": 0.69765377, "num_input_tokens_seen": 113269630, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.55078125, "step": 5274, "time_per_iteration": 2.4086549282073975 }, { "auxiliary_loss_clip": 0.01076455, "auxiliary_loss_mlp": 0.01037898, "balance_loss_clip": 1.0220077, "balance_loss_mlp": 1.023458, "epoch": 0.3171501578235382, "flos": 24134130120960.0, "grad_norm": 1.711708657366332, "language_loss": 0.80559027, "learning_rate": 3.0871118951913423e-06, "loss": 0.82673383, "num_input_tokens_seen": 113291200, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.53125, "step": 5275, "time_per_iteration": 3.776034355163574 }, { "auxiliary_loss_clip": 0.01080137, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.01566255, "balance_loss_mlp": 1.02465034, "epoch": 0.31721028107620625, "flos": 18878724184320.0, "grad_norm": 2.2230326829244618, "language_loss": 0.72492123, "learning_rate": 3.0867947419573693e-06, "loss": 0.7460475, "num_input_tokens_seen": 113310170, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5546875, "step": 5276, "time_per_iteration": 2.359403133392334 }, { "auxiliary_loss_clip": 0.01076312, "auxiliary_loss_mlp": 0.01027512, "balance_loss_clip": 1.01140654, "balance_loss_mlp": 1.0237422, "epoch": 0.3172704043288742, "flos": 23146786471680.0, "grad_norm": 1.4496608106011593, "language_loss": 0.78041995, "learning_rate": 3.0864775499384873e-06, "loss": 0.80145818, "num_input_tokens_seen": 113331140, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5234375, "step": 5277, "time_per_iteration": 2.4496803283691406 }, { "auxiliary_loss_clip": 0.01080901, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.01834309, "balance_loss_mlp": 1.02437401, "epoch": 0.3173305275815422, "flos": 17857549560960.0, "grad_norm": 1.7043471396870504, "language_loss": 0.79108131, "learning_rate": 3.086160319146016e-06, "loss": 0.81225532, "num_input_tokens_seen": 113350030, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.56640625, "step": 5278, "time_per_iteration": 3.7243001461029053 }, { "auxiliary_loss_clip": 0.01013157, "auxiliary_loss_mlp": 0.01002575, "balance_loss_clip": 1.00128186, "balance_loss_mlp": 1.00111103, "epoch": 0.31739065083421014, "flos": 59971042442880.0, "grad_norm": 0.8719670676254162, "language_loss": 0.62874103, "learning_rate": 3.0858430495912772e-06, "loss": 0.64889824, "num_input_tokens_seen": 113395820, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.12011719, "step": 5279, "time_per_iteration": 2.790759801864624 }, { "auxiliary_loss_clip": 0.01084219, "auxiliary_loss_mlp": 0.01040203, "balance_loss_clip": 1.02104592, "balance_loss_mlp": 1.02513933, "epoch": 0.3174507740868781, "flos": 23799976698240.0, "grad_norm": 1.67235337841515, "language_loss": 0.81060869, "learning_rate": 3.0855257412855933e-06, "loss": 0.83185291, "num_input_tokens_seen": 113416835, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.58984375, "step": 5280, "time_per_iteration": 2.4147047996520996 }, { "auxiliary_loss_clip": 0.01081347, "auxiliary_loss_mlp": 0.01042717, "balance_loss_clip": 1.0264926, "balance_loss_mlp": 1.02539849, "epoch": 0.31751089733954607, "flos": 27637594433280.0, "grad_norm": 1.581023055194415, "language_loss": 0.78002334, "learning_rate": 3.0852083942402874e-06, "loss": 0.80126405, "num_input_tokens_seen": 113440850, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55859375, "step": 5281, "time_per_iteration": 2.471285104751587 }, { "auxiliary_loss_clip": 0.01080128, "auxiliary_loss_mlp": 0.01030832, "balance_loss_clip": 1.01379657, "balance_loss_mlp": 1.02490568, "epoch": 0.31757102059221404, "flos": 23768135671680.0, "grad_norm": 1.6327436154783515, "language_loss": 0.78271222, "learning_rate": 3.084891008466686e-06, "loss": 0.8038218, "num_input_tokens_seen": 113461000, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5546875, "step": 5282, "time_per_iteration": 2.39589524269104 }, { "auxiliary_loss_clip": 0.01082877, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.02387059, "balance_loss_mlp": 1.02483785, "epoch": 0.317631143844882, "flos": 25263361002240.0, "grad_norm": 1.9959048398359513, "language_loss": 0.67214715, "learning_rate": 3.0845735839761145e-06, "loss": 0.69338346, "num_input_tokens_seen": 113480820, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.578125, "step": 5283, "time_per_iteration": 2.403817653656006 }, { "auxiliary_loss_clip": 0.01013771, "auxiliary_loss_mlp": 0.01001144, "balance_loss_clip": 0.99983269, "balance_loss_mlp": 1.00153661, "epoch": 0.31769126709754997, "flos": 55823290300800.0, "grad_norm": 0.7362289128213836, "language_loss": 0.52765673, "learning_rate": 3.084256120779902e-06, "loss": 0.54780585, "num_input_tokens_seen": 113536910, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.12207031, "step": 5284, "time_per_iteration": 2.9949584007263184 }, { "auxiliary_loss_clip": 0.01085186, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 1.02481723, "balance_loss_mlp": 1.02751923, "epoch": 0.31775139035021793, "flos": 16689635026560.0, "grad_norm": 2.288646676387672, "language_loss": 0.69851232, "learning_rate": 3.0839386188893777e-06, "loss": 0.71978313, "num_input_tokens_seen": 113555480, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.578125, "step": 5285, "time_per_iteration": 2.3910608291625977 }, { "auxiliary_loss_clip": 0.01013096, "auxiliary_loss_mlp": 0.0100228, "balance_loss_clip": 1.00102246, "balance_loss_mlp": 1.00139141, "epoch": 0.3178115136028859, "flos": 64224090979200.0, "grad_norm": 0.813032882130715, "language_loss": 0.60546649, "learning_rate": 3.083621078315872e-06, "loss": 0.62562025, "num_input_tokens_seen": 113616790, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.1171875, "step": 5286, "time_per_iteration": 3.0806162357330322 }, { "auxiliary_loss_clip": 0.01082214, "auxiliary_loss_mlp": 0.01037579, "balance_loss_clip": 1.02079391, "balance_loss_mlp": 1.0257957, "epoch": 0.31787163685555386, "flos": 18696477553920.0, "grad_norm": 1.6527688333029478, "language_loss": 0.71768641, "learning_rate": 3.083303499070718e-06, "loss": 0.73888433, "num_input_tokens_seen": 113635320, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5625, "step": 5287, "time_per_iteration": 2.3819234371185303 }, { "auxiliary_loss_clip": 0.01080167, "auxiliary_loss_mlp": 0.01037996, "balance_loss_clip": 1.01981616, "balance_loss_mlp": 1.02329683, "epoch": 0.31793176010822183, "flos": 21323691763200.0, "grad_norm": 1.8911901920569434, "language_loss": 0.75618762, "learning_rate": 3.082985881165248e-06, "loss": 0.77736926, "num_input_tokens_seen": 113654000, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.5703125, "step": 5288, "time_per_iteration": 2.3669891357421875 }, { "auxiliary_loss_clip": 0.01075516, "auxiliary_loss_mlp": 0.010258, "balance_loss_clip": 1.01118445, "balance_loss_mlp": 1.02206612, "epoch": 0.31799188336088985, "flos": 20957662402560.0, "grad_norm": 1.646249100576446, "language_loss": 0.87391573, "learning_rate": 3.082668224610798e-06, "loss": 0.89492893, "num_input_tokens_seen": 113672375, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.53515625, "step": 5289, "time_per_iteration": 2.394063711166382 }, { "auxiliary_loss_clip": 0.01078334, "auxiliary_loss_mlp": 0.01030259, "balance_loss_clip": 1.01497591, "balance_loss_mlp": 1.02536571, "epoch": 0.3180520066135578, "flos": 22490838247680.0, "grad_norm": 2.3050482578316975, "language_loss": 0.67488748, "learning_rate": 3.0823505294187044e-06, "loss": 0.6959734, "num_input_tokens_seen": 113692385, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53125, "step": 5290, "time_per_iteration": 2.390932321548462 }, { "auxiliary_loss_clip": 0.01080873, "auxiliary_loss_mlp": 0.01032342, "balance_loss_clip": 1.01449621, "balance_loss_mlp": 1.02368319, "epoch": 0.3181121298662258, "flos": 27234103317120.0, "grad_norm": 2.62376350472759, "language_loss": 0.80194283, "learning_rate": 3.0820327956003045e-06, "loss": 0.82307494, "num_input_tokens_seen": 113712145, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5703125, "step": 5291, "time_per_iteration": 2.4348864555358887 }, { "auxiliary_loss_clip": 0.01079954, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.01600647, "balance_loss_mlp": 1.02360988, "epoch": 0.31817225311889374, "flos": 23179186080000.0, "grad_norm": 2.0409961214154677, "language_loss": 0.7972188, "learning_rate": 3.0817150231669367e-06, "loss": 0.81835687, "num_input_tokens_seen": 113731435, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.5625, "step": 5292, "time_per_iteration": 2.3765571117401123 }, { "auxiliary_loss_clip": 0.01074242, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 1.01445878, "balance_loss_mlp": 1.02328396, "epoch": 0.3182323763715617, "flos": 23257670549760.0, "grad_norm": 2.314515886392783, "language_loss": 0.74406004, "learning_rate": 3.081397212129943e-06, "loss": 0.76509488, "num_input_tokens_seen": 113750825, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 5293, "time_per_iteration": 2.3932366371154785 }, { "auxiliary_loss_clip": 0.01077713, "auxiliary_loss_mlp": 0.01031669, "balance_loss_clip": 1.01651788, "balance_loss_mlp": 1.02501369, "epoch": 0.3182924996242297, "flos": 29015581818240.0, "grad_norm": 6.755118345589732, "language_loss": 0.73712504, "learning_rate": 3.0810793625006637e-06, "loss": 0.75821888, "num_input_tokens_seen": 113770010, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5234375, "step": 5294, "time_per_iteration": 2.4285929203033447 }, { "auxiliary_loss_clip": 0.01081725, "auxiliary_loss_mlp": 0.01030603, "balance_loss_clip": 1.01303184, "balance_loss_mlp": 1.02475572, "epoch": 0.31835262287689764, "flos": 20448139887360.0, "grad_norm": 4.095635023253408, "language_loss": 0.76049137, "learning_rate": 3.080761474290443e-06, "loss": 0.78161466, "num_input_tokens_seen": 113788640, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5703125, "step": 5295, "time_per_iteration": 2.374361276626587 }, { "auxiliary_loss_clip": 0.01082446, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.02065003, "balance_loss_mlp": 1.02512217, "epoch": 0.3184127461295656, "flos": 25118296836480.0, "grad_norm": 1.6558944150019088, "language_loss": 0.69456697, "learning_rate": 3.0804435475106265e-06, "loss": 0.71576238, "num_input_tokens_seen": 113809515, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.57421875, "step": 5296, "time_per_iteration": 2.413593292236328 }, { "auxiliary_loss_clip": 0.01079503, "auxiliary_loss_mlp": 0.01032717, "balance_loss_clip": 1.01682687, "balance_loss_mlp": 1.02460003, "epoch": 0.31847286938223357, "flos": 25550207665920.0, "grad_norm": 1.7147605209163255, "language_loss": 0.77639103, "learning_rate": 3.0801255821725578e-06, "loss": 0.79751325, "num_input_tokens_seen": 113829770, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 5297, "time_per_iteration": 2.4264285564422607 }, { "auxiliary_loss_clip": 0.01077453, "auxiliary_loss_mlp": 0.01028859, "balance_loss_clip": 1.01299191, "balance_loss_mlp": 1.02425241, "epoch": 0.31853299263490154, "flos": 27781227233280.0, "grad_norm": 2.6881575346058066, "language_loss": 0.79487884, "learning_rate": 3.0798075782875854e-06, "loss": 0.81594205, "num_input_tokens_seen": 113849320, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53515625, "step": 5298, "time_per_iteration": 2.4228458404541016 }, { "auxiliary_loss_clip": 0.01080002, "auxiliary_loss_mlp": 0.01038655, "balance_loss_clip": 1.0229069, "balance_loss_mlp": 1.0242033, "epoch": 0.3185931158875695, "flos": 22705763777280.0, "grad_norm": 1.6278739612813984, "language_loss": 0.74050403, "learning_rate": 3.0794895358670587e-06, "loss": 0.76169056, "num_input_tokens_seen": 113867860, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55859375, "step": 5299, "time_per_iteration": 2.3948004245758057 }, { "auxiliary_loss_clip": 0.01081397, "auxiliary_loss_mlp": 0.01034239, "balance_loss_clip": 1.01760912, "balance_loss_mlp": 1.0242449, "epoch": 0.31865323914023747, "flos": 24570369959040.0, "grad_norm": 2.19396212851942, "language_loss": 0.78422546, "learning_rate": 3.079171454922327e-06, "loss": 0.80538183, "num_input_tokens_seen": 113886375, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5703125, "step": 5300, "time_per_iteration": 2.3989174365997314 }, { "auxiliary_loss_clip": 0.01079522, "auxiliary_loss_mlp": 0.01033003, "balance_loss_clip": 1.01617098, "balance_loss_mlp": 1.02372551, "epoch": 0.31871336239290543, "flos": 18185593495680.0, "grad_norm": 1.9696395749252553, "language_loss": 0.84092903, "learning_rate": 3.0788533354647425e-06, "loss": 0.86205423, "num_input_tokens_seen": 113904065, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.55859375, "step": 5301, "time_per_iteration": 2.3700175285339355 }, { "auxiliary_loss_clip": 0.01079199, "auxiliary_loss_mlp": 0.01037238, "balance_loss_clip": 1.02050066, "balance_loss_mlp": 1.02565467, "epoch": 0.31877348564557345, "flos": 21825917804160.0, "grad_norm": 2.099518613113411, "language_loss": 0.77145398, "learning_rate": 3.078535177505657e-06, "loss": 0.79261839, "num_input_tokens_seen": 113918415, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.53515625, "step": 5302, "time_per_iteration": 2.3479936122894287 }, { "auxiliary_loss_clip": 0.01074655, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.01954257, "balance_loss_mlp": 1.02352607, "epoch": 0.3188336088982414, "flos": 22014239011200.0, "grad_norm": 1.6542924805411356, "language_loss": 0.78942561, "learning_rate": 3.0782169810564256e-06, "loss": 0.81052822, "num_input_tokens_seen": 113938135, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5078125, "step": 5303, "time_per_iteration": 2.408334493637085 }, { "auxiliary_loss_clip": 0.01084304, "auxiliary_loss_mlp": 0.01041492, "balance_loss_clip": 1.02434969, "balance_loss_mlp": 1.02641249, "epoch": 0.3188937321509094, "flos": 20046848186880.0, "grad_norm": 2.0001558558790142, "language_loss": 0.72907531, "learning_rate": 3.0778987461284035e-06, "loss": 0.75033325, "num_input_tokens_seen": 113957125, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.578125, "step": 5304, "time_per_iteration": 2.374227523803711 }, { "auxiliary_loss_clip": 0.01076543, "auxiliary_loss_mlp": 0.01033692, "balance_loss_clip": 1.01899385, "balance_loss_mlp": 1.02409315, "epoch": 0.31895385540357735, "flos": 25846934244480.0, "grad_norm": 1.952239126447074, "language_loss": 0.72067142, "learning_rate": 3.077580472732948e-06, "loss": 0.74177378, "num_input_tokens_seen": 113974875, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5234375, "step": 5305, "time_per_iteration": 2.4203715324401855 }, { "auxiliary_loss_clip": 0.01078933, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.01834631, "balance_loss_mlp": 1.0255084, "epoch": 0.3190139786562453, "flos": 23476575974400.0, "grad_norm": 1.710631185794342, "language_loss": 0.6401546, "learning_rate": 3.077262160881417e-06, "loss": 0.66127276, "num_input_tokens_seen": 113994450, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.53515625, "step": 5306, "time_per_iteration": 2.393486976623535 }, { "auxiliary_loss_clip": 0.0108025, "auxiliary_loss_mlp": 0.01034127, "balance_loss_clip": 1.01727045, "balance_loss_mlp": 1.0257448, "epoch": 0.3190741019089133, "flos": 29094275756160.0, "grad_norm": 1.9610930205302717, "language_loss": 0.7924794, "learning_rate": 3.07694381058517e-06, "loss": 0.81362319, "num_input_tokens_seen": 114013945, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.546875, "step": 5307, "time_per_iteration": 2.442568063735962 }, { "auxiliary_loss_clip": 0.01074879, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.01525021, "balance_loss_mlp": 1.02398574, "epoch": 0.31913422516158124, "flos": 17128563039360.0, "grad_norm": 1.662184571938068, "language_loss": 0.77445477, "learning_rate": 3.07662542185557e-06, "loss": 0.79551548, "num_input_tokens_seen": 114031375, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5078125, "step": 5308, "time_per_iteration": 2.352698802947998 }, { "auxiliary_loss_clip": 0.01086511, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.01606214, "balance_loss_mlp": 1.02551162, "epoch": 0.3191943484142492, "flos": 16068949142400.0, "grad_norm": 2.2970064580292457, "language_loss": 0.73742926, "learning_rate": 3.0763069947039774e-06, "loss": 0.75863743, "num_input_tokens_seen": 114048465, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.609375, "step": 5309, "time_per_iteration": 2.354645252227783 }, { "auxiliary_loss_clip": 0.01079275, "auxiliary_loss_mlp": 0.01029999, "balance_loss_clip": 1.01563466, "balance_loss_mlp": 1.02524686, "epoch": 0.3192544716669172, "flos": 22965063511680.0, "grad_norm": 2.175398532649715, "language_loss": 0.82634771, "learning_rate": 3.0759885291417574e-06, "loss": 0.84744048, "num_input_tokens_seen": 114068415, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.5390625, "step": 5310, "time_per_iteration": 3.7492876052856445 }, { "auxiliary_loss_clip": 0.01077036, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.01672697, "balance_loss_mlp": 1.02340722, "epoch": 0.31931459491958514, "flos": 26869121297280.0, "grad_norm": 1.3851259056368521, "language_loss": 0.78265637, "learning_rate": 3.0756700251802745e-06, "loss": 0.80374658, "num_input_tokens_seen": 114088565, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5390625, "step": 5311, "time_per_iteration": 2.4239578247070312 }, { "auxiliary_loss_clip": 0.0107738, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.01640153, "balance_loss_mlp": 1.02321672, "epoch": 0.3193747181722531, "flos": 21835413694080.0, "grad_norm": 1.7725353980341623, "language_loss": 0.84363174, "learning_rate": 3.0753514828308942e-06, "loss": 0.86473691, "num_input_tokens_seen": 114107160, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5390625, "step": 5312, "time_per_iteration": 2.3842127323150635 }, { "auxiliary_loss_clip": 0.01081476, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.01817369, "balance_loss_mlp": 1.02505708, "epoch": 0.31943484142492107, "flos": 18324233971200.0, "grad_norm": 2.2289129262842793, "language_loss": 0.78590673, "learning_rate": 3.0750329021049863e-06, "loss": 0.80707633, "num_input_tokens_seen": 114123420, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.56640625, "step": 5313, "time_per_iteration": 3.7440524101257324 }, { "auxiliary_loss_clip": 0.01076528, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.01303971, "balance_loss_mlp": 1.02394986, "epoch": 0.31949496467758903, "flos": 21614762701440.0, "grad_norm": 2.69514675467478, "language_loss": 0.86007148, "learning_rate": 3.074714283013919e-06, "loss": 0.88112479, "num_input_tokens_seen": 114139230, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5234375, "step": 5314, "time_per_iteration": 3.768296003341675 }, { "auxiliary_loss_clip": 0.01077563, "auxiliary_loss_mlp": 0.01031038, "balance_loss_clip": 1.01520741, "balance_loss_mlp": 1.0241971, "epoch": 0.31955508793025705, "flos": 21759198462720.0, "grad_norm": 2.3285616252378714, "language_loss": 0.79701555, "learning_rate": 3.074395625569064e-06, "loss": 0.81810158, "num_input_tokens_seen": 114159290, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53515625, "step": 5315, "time_per_iteration": 2.379410743713379 }, { "auxiliary_loss_clip": 0.01080402, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.01719558, "balance_loss_mlp": 1.02462316, "epoch": 0.319615211182925, "flos": 17163406442880.0, "grad_norm": 1.660955287492213, "language_loss": 0.68103802, "learning_rate": 3.074076929781793e-06, "loss": 0.70217216, "num_input_tokens_seen": 114177655, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55859375, "step": 5316, "time_per_iteration": 2.3701882362365723 }, { "auxiliary_loss_clip": 0.01078854, "auxiliary_loss_mlp": 0.01032013, "balance_loss_clip": 1.01768994, "balance_loss_mlp": 1.02430809, "epoch": 0.319675334435593, "flos": 28111505495040.0, "grad_norm": 1.895388721289121, "language_loss": 0.6942125, "learning_rate": 3.073758195663479e-06, "loss": 0.71532118, "num_input_tokens_seen": 114200880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.546875, "step": 5317, "time_per_iteration": 2.439493417739868 }, { "auxiliary_loss_clip": 0.01014945, "auxiliary_loss_mlp": 0.01007199, "balance_loss_clip": 1.00601327, "balance_loss_mlp": 1.00331116, "epoch": 0.31973545768826095, "flos": 69497266798080.0, "grad_norm": 0.7302130338257007, "language_loss": 0.53033507, "learning_rate": 3.0734394232254967e-06, "loss": 0.55055654, "num_input_tokens_seen": 114267145, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.11621094, "step": 5318, "time_per_iteration": 4.543852090835571 }, { "auxiliary_loss_clip": 0.01077381, "auxiliary_loss_mlp": 0.01032746, "balance_loss_clip": 1.01823258, "balance_loss_mlp": 1.02470493, "epoch": 0.3197955809409289, "flos": 13698346492800.0, "grad_norm": 3.015290669098223, "language_loss": 0.8383435, "learning_rate": 3.0731206124792225e-06, "loss": 0.85944486, "num_input_tokens_seen": 114284630, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.52734375, "step": 5319, "time_per_iteration": 2.3598384857177734 }, { "auxiliary_loss_clip": 0.01077311, "auxiliary_loss_mlp": 0.01037845, "balance_loss_clip": 1.02203143, "balance_loss_mlp": 1.02440691, "epoch": 0.3198557041935969, "flos": 33216750207360.0, "grad_norm": 2.114462507644159, "language_loss": 0.63758969, "learning_rate": 3.0728017634360345e-06, "loss": 0.65874124, "num_input_tokens_seen": 114305830, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53125, "step": 5320, "time_per_iteration": 2.4845988750457764 }, { "auxiliary_loss_clip": 0.01082309, "auxiliary_loss_mlp": 0.01034486, "balance_loss_clip": 1.01903701, "balance_loss_mlp": 1.02537179, "epoch": 0.31991582744626484, "flos": 23730918295680.0, "grad_norm": 1.8644225581946614, "language_loss": 0.71038461, "learning_rate": 3.072482876107311e-06, "loss": 0.7315526, "num_input_tokens_seen": 114325165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5703125, "step": 5321, "time_per_iteration": 2.394752264022827 }, { "auxiliary_loss_clip": 0.010844, "auxiliary_loss_mlp": 0.01037776, "balance_loss_clip": 1.02035344, "balance_loss_mlp": 1.02610743, "epoch": 0.3199759506989328, "flos": 18549877288320.0, "grad_norm": 2.51645326805439, "language_loss": 0.86026931, "learning_rate": 3.072163950504432e-06, "loss": 0.88149107, "num_input_tokens_seen": 114341310, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.58203125, "step": 5322, "time_per_iteration": 2.354313611984253 }, { "auxiliary_loss_clip": 0.01077182, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.01500106, "balance_loss_mlp": 1.02371693, "epoch": 0.3200360739516008, "flos": 22417799950080.0, "grad_norm": 1.6746175190444528, "language_loss": 0.8324002, "learning_rate": 3.0718449866387805e-06, "loss": 0.85347468, "num_input_tokens_seen": 114360355, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53515625, "step": 5323, "time_per_iteration": 2.394345998764038 }, { "auxiliary_loss_clip": 0.0107649, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.01686072, "balance_loss_mlp": 1.0236522, "epoch": 0.32009619720426874, "flos": 20594181571200.0, "grad_norm": 1.7991578629235503, "language_loss": 0.78456134, "learning_rate": 3.071525984521738e-06, "loss": 0.80564952, "num_input_tokens_seen": 114379220, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.52734375, "step": 5324, "time_per_iteration": 2.375988006591797 }, { "auxiliary_loss_clip": 0.01077051, "auxiliary_loss_mlp": 0.01029844, "balance_loss_clip": 1.01416802, "balance_loss_mlp": 1.02404213, "epoch": 0.3201563204569367, "flos": 18146735285760.0, "grad_norm": 1.7299068565888955, "language_loss": 0.79906166, "learning_rate": 3.0712069441646896e-06, "loss": 0.82013065, "num_input_tokens_seen": 114396365, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 5325, "time_per_iteration": 2.3500137329101562 }, { "auxiliary_loss_clip": 0.01079942, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.01796389, "balance_loss_mlp": 1.02517414, "epoch": 0.32021644370960467, "flos": 31682945957760.0, "grad_norm": 1.7353914335932255, "language_loss": 0.74854422, "learning_rate": 3.0708878655790207e-06, "loss": 0.7696777, "num_input_tokens_seen": 114416780, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.546875, "step": 5326, "time_per_iteration": 2.473125457763672 }, { "auxiliary_loss_clip": 0.01078257, "auxiliary_loss_mlp": 0.01034268, "balance_loss_clip": 1.0186758, "balance_loss_mlp": 1.02567434, "epoch": 0.32027656696227264, "flos": 26864827200000.0, "grad_norm": 1.7238630991112263, "language_loss": 0.80849326, "learning_rate": 3.070568748776118e-06, "loss": 0.82961857, "num_input_tokens_seen": 114437405, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.52734375, "step": 5327, "time_per_iteration": 2.444814920425415 }, { "auxiliary_loss_clip": 0.01082048, "auxiliary_loss_mlp": 0.01025392, "balance_loss_clip": 1.00933397, "balance_loss_mlp": 1.02633858, "epoch": 0.32033669021494066, "flos": 24168798967680.0, "grad_norm": 1.5346637189642018, "language_loss": 0.77680606, "learning_rate": 3.0702495937673713e-06, "loss": 0.79788041, "num_input_tokens_seen": 114458505, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5546875, "step": 5328, "time_per_iteration": 2.422175645828247 }, { "auxiliary_loss_clip": 0.01079682, "auxiliary_loss_mlp": 0.0103277, "balance_loss_clip": 1.01577044, "balance_loss_mlp": 1.02425337, "epoch": 0.3203968134676086, "flos": 24459660437760.0, "grad_norm": 1.6415220813276596, "language_loss": 0.74103975, "learning_rate": 3.0699304005641686e-06, "loss": 0.76216424, "num_input_tokens_seen": 114479050, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5546875, "step": 5329, "time_per_iteration": 2.408017158508301 }, { "auxiliary_loss_clip": 0.0107571, "auxiliary_loss_mlp": 0.0102651, "balance_loss_clip": 1.01276469, "balance_loss_mlp": 1.02347875, "epoch": 0.3204569367202766, "flos": 18003730890240.0, "grad_norm": 1.6329958919204735, "language_loss": 0.70584631, "learning_rate": 3.069611169177903e-06, "loss": 0.72686857, "num_input_tokens_seen": 114497415, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.5234375, "step": 5330, "time_per_iteration": 2.3712151050567627 }, { "auxiliary_loss_clip": 0.01083769, "auxiliary_loss_mlp": 0.01030648, "balance_loss_clip": 1.01284981, "balance_loss_mlp": 1.02580059, "epoch": 0.32051705997294455, "flos": 30588418834560.0, "grad_norm": 1.8023699623724145, "language_loss": 0.79893219, "learning_rate": 3.069291899619966e-06, "loss": 0.82007635, "num_input_tokens_seen": 114518785, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.578125, "step": 5331, "time_per_iteration": 2.4474527835845947 }, { "auxiliary_loss_clip": 0.01014206, "auxiliary_loss_mlp": 0.01005651, "balance_loss_clip": 1.00445902, "balance_loss_mlp": 1.00218296, "epoch": 0.3205771832256125, "flos": 68414855783040.0, "grad_norm": 0.8261636075296693, "language_loss": 0.57823443, "learning_rate": 3.0689725919017517e-06, "loss": 0.59843302, "num_input_tokens_seen": 114577710, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.12011719, "step": 5332, "time_per_iteration": 2.952845335006714 }, { "auxiliary_loss_clip": 0.01081163, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.01600242, "balance_loss_mlp": 1.02478945, "epoch": 0.3206373064782805, "flos": 30442691352960.0, "grad_norm": 1.5109880214283495, "language_loss": 0.73105484, "learning_rate": 3.068653246034655e-06, "loss": 0.75218832, "num_input_tokens_seen": 114598640, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5625, "step": 5333, "time_per_iteration": 2.4440903663635254 }, { "auxiliary_loss_clip": 0.01079604, "auxiliary_loss_mlp": 0.01037929, "balance_loss_clip": 1.02090633, "balance_loss_mlp": 1.02470422, "epoch": 0.32069742973094845, "flos": 22053411423360.0, "grad_norm": 1.5521675736367253, "language_loss": 0.7039721, "learning_rate": 3.0683338620300728e-06, "loss": 0.72514749, "num_input_tokens_seen": 114618780, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.55078125, "step": 5334, "time_per_iteration": 2.3876729011535645 }, { "auxiliary_loss_clip": 0.01078196, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.01683152, "balance_loss_mlp": 1.02403474, "epoch": 0.3207575529836164, "flos": 22052922664320.0, "grad_norm": 1.8526497599509197, "language_loss": 0.775653, "learning_rate": 3.068014439899404e-06, "loss": 0.79676288, "num_input_tokens_seen": 114637525, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.54296875, "step": 5335, "time_per_iteration": 2.3775601387023926 }, { "auxiliary_loss_clip": 0.01078461, "auxiliary_loss_mlp": 0.01030351, "balance_loss_clip": 1.01471066, "balance_loss_mlp": 1.02449226, "epoch": 0.3208176762362844, "flos": 34056132048000.0, "grad_norm": 1.6648730709315847, "language_loss": 0.68008214, "learning_rate": 3.0676949796540458e-06, "loss": 0.70117021, "num_input_tokens_seen": 114659705, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5390625, "step": 5336, "time_per_iteration": 2.484494924545288 }, { "auxiliary_loss_clip": 0.01080373, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.01808918, "balance_loss_mlp": 1.02567554, "epoch": 0.32087779948895234, "flos": 21797637736320.0, "grad_norm": 9.52818582865673, "language_loss": 0.79062629, "learning_rate": 3.067375481305401e-06, "loss": 0.81177551, "num_input_tokens_seen": 114678340, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 5337, "time_per_iteration": 2.381190538406372 }, { "auxiliary_loss_clip": 0.01073155, "auxiliary_loss_mlp": 0.01030506, "balance_loss_clip": 1.0164988, "balance_loss_mlp": 1.02252102, "epoch": 0.3209379227416203, "flos": 21433039741440.0, "grad_norm": 1.7488093555085955, "language_loss": 0.74007773, "learning_rate": 3.0670559448648707e-06, "loss": 0.76111436, "num_input_tokens_seen": 114696980, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.5078125, "step": 5338, "time_per_iteration": 2.383863687515259 }, { "auxiliary_loss_clip": 0.01079571, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.01314247, "balance_loss_mlp": 1.02386236, "epoch": 0.3209980459942883, "flos": 25847876851200.0, "grad_norm": 1.7597184247151785, "language_loss": 0.62648952, "learning_rate": 3.0667363703438588e-06, "loss": 0.64757484, "num_input_tokens_seen": 114717330, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.55859375, "step": 5339, "time_per_iteration": 2.406446695327759 }, { "auxiliary_loss_clip": 0.01078526, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 1.01575601, "balance_loss_mlp": 1.02463818, "epoch": 0.32105816924695624, "flos": 19098153279360.0, "grad_norm": 2.4589022355520385, "language_loss": 0.8209306, "learning_rate": 3.0664167577537696e-06, "loss": 0.8420341, "num_input_tokens_seen": 114736320, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5340, "time_per_iteration": 2.366525650024414 }, { "auxiliary_loss_clip": 0.01078453, "auxiliary_loss_mlp": 0.01040906, "balance_loss_clip": 1.02472901, "balance_loss_mlp": 1.02388883, "epoch": 0.3211182924996242, "flos": 16580915452800.0, "grad_norm": 1.8944822944965334, "language_loss": 0.76445788, "learning_rate": 3.0660971071060095e-06, "loss": 0.78565145, "num_input_tokens_seen": 114754575, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.546875, "step": 5341, "time_per_iteration": 2.3488073348999023 }, { "auxiliary_loss_clip": 0.01076127, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.01875222, "balance_loss_mlp": 1.02412963, "epoch": 0.3211784157522922, "flos": 22671164753280.0, "grad_norm": 1.6767059818845793, "language_loss": 0.79426581, "learning_rate": 3.0657774184119854e-06, "loss": 0.81536341, "num_input_tokens_seen": 114773590, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.51953125, "step": 5342, "time_per_iteration": 2.406992197036743 }, { "auxiliary_loss_clip": 0.01079338, "auxiliary_loss_mlp": 0.01034497, "balance_loss_clip": 1.01804614, "balance_loss_mlp": 1.02480423, "epoch": 0.3212385390049602, "flos": 20557732245120.0, "grad_norm": 2.854786770329366, "language_loss": 0.75216693, "learning_rate": 3.065457691683108e-06, "loss": 0.7733053, "num_input_tokens_seen": 114790775, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 5343, "time_per_iteration": 2.3806638717651367 }, { "auxiliary_loss_clip": 0.01077548, "auxiliary_loss_mlp": 0.01031206, "balance_loss_clip": 1.01543427, "balance_loss_mlp": 1.02391636, "epoch": 0.32129866225762815, "flos": 24789973610880.0, "grad_norm": 2.594467174111739, "language_loss": 0.82471192, "learning_rate": 3.0651379269307853e-06, "loss": 0.84579945, "num_input_tokens_seen": 114809835, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53515625, "step": 5344, "time_per_iteration": 2.4152557849884033 }, { "auxiliary_loss_clip": 0.01078085, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.01802278, "balance_loss_mlp": 1.02296662, "epoch": 0.3213587855102961, "flos": 18365954912640.0, "grad_norm": 1.9454076014182453, "language_loss": 0.79905093, "learning_rate": 3.06481812416643e-06, "loss": 0.8201713, "num_input_tokens_seen": 114826505, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.55078125, "step": 5345, "time_per_iteration": 2.339587926864624 }, { "auxiliary_loss_clip": 0.01077468, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.02013516, "balance_loss_mlp": 1.02376652, "epoch": 0.3214189087629641, "flos": 27014778956160.0, "grad_norm": 1.6835044905221574, "language_loss": 0.82979214, "learning_rate": 3.0644982834014545e-06, "loss": 0.85092378, "num_input_tokens_seen": 114846140, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5390625, "step": 5346, "time_per_iteration": 2.42110538482666 }, { "auxiliary_loss_clip": 0.01078451, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.01922059, "balance_loss_mlp": 1.0234127, "epoch": 0.32147903201563205, "flos": 23147170496640.0, "grad_norm": 1.461422133704922, "language_loss": 0.8155455, "learning_rate": 3.0641784046472745e-06, "loss": 0.83667964, "num_input_tokens_seen": 114866660, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.55078125, "step": 5347, "time_per_iteration": 2.4126431941986084 }, { "auxiliary_loss_clip": 0.01077756, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.01629162, "balance_loss_mlp": 1.02431095, "epoch": 0.3215391552683, "flos": 16579833200640.0, "grad_norm": 2.213795023887685, "language_loss": 0.79823768, "learning_rate": 3.063858487915304e-06, "loss": 0.81933963, "num_input_tokens_seen": 114882820, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.53515625, "step": 5348, "time_per_iteration": 2.358952760696411 }, { "auxiliary_loss_clip": 0.01080646, "auxiliary_loss_mlp": 0.01039472, "balance_loss_clip": 1.02431464, "balance_loss_mlp": 1.02657938, "epoch": 0.321599278520968, "flos": 17820855855360.0, "grad_norm": 1.9932386942977218, "language_loss": 0.84975469, "learning_rate": 3.0635385332169606e-06, "loss": 0.87095582, "num_input_tokens_seen": 114900745, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5390625, "step": 5349, "time_per_iteration": 3.757690191268921 }, { "auxiliary_loss_clip": 0.01076711, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.01833105, "balance_loss_mlp": 1.02378941, "epoch": 0.32165940177363594, "flos": 16250881570560.0, "grad_norm": 1.612803389003936, "language_loss": 0.80543709, "learning_rate": 3.063218540563663e-06, "loss": 0.82653975, "num_input_tokens_seen": 114917940, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.52734375, "step": 5350, "time_per_iteration": 2.3480212688446045 }, { "auxiliary_loss_clip": 0.010773, "auxiliary_loss_mlp": 0.01031107, "balance_loss_clip": 1.01612234, "balance_loss_mlp": 1.0239172, "epoch": 0.3217195250263039, "flos": 27598666400640.0, "grad_norm": 1.4459224358230414, "language_loss": 0.80121368, "learning_rate": 3.06289850996683e-06, "loss": 0.82229781, "num_input_tokens_seen": 114937735, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53125, "step": 5351, "time_per_iteration": 2.420304298400879 }, { "auxiliary_loss_clip": 0.01078208, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.01911521, "balance_loss_mlp": 1.02447629, "epoch": 0.3217796482789719, "flos": 21469523978880.0, "grad_norm": 1.7297810608015358, "language_loss": 0.75632811, "learning_rate": 3.062578441437884e-06, "loss": 0.77746028, "num_input_tokens_seen": 114956630, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5390625, "step": 5352, "time_per_iteration": 2.3908603191375732 }, { "auxiliary_loss_clip": 0.01078515, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.01867068, "balance_loss_mlp": 1.0249629, "epoch": 0.32183977153163984, "flos": 21214518341760.0, "grad_norm": 3.985252892116223, "language_loss": 0.8181082, "learning_rate": 3.062258334988246e-06, "loss": 0.83924145, "num_input_tokens_seen": 114976470, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.53515625, "step": 5353, "time_per_iteration": 5.1726415157318115 }, { "auxiliary_loss_clip": 0.01075771, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.01659489, "balance_loss_mlp": 1.02447271, "epoch": 0.3218998947843078, "flos": 24607028753280.0, "grad_norm": 1.5477413546729974, "language_loss": 0.73335499, "learning_rate": 3.0619381906293414e-06, "loss": 0.75443506, "num_input_tokens_seen": 114996710, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51171875, "step": 5354, "time_per_iteration": 2.395334005355835 }, { "auxiliary_loss_clip": 0.01076708, "auxiliary_loss_mlp": 0.01031507, "balance_loss_clip": 1.01655841, "balance_loss_mlp": 1.0236063, "epoch": 0.3219600180369758, "flos": 22269558850560.0, "grad_norm": 1.5242890369922673, "language_loss": 0.83404744, "learning_rate": 3.0616180083725943e-06, "loss": 0.85512954, "num_input_tokens_seen": 115015775, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.53125, "step": 5355, "time_per_iteration": 2.397935152053833 }, { "auxiliary_loss_clip": 0.01083671, "auxiliary_loss_mlp": 0.01030719, "balance_loss_clip": 1.0153172, "balance_loss_mlp": 1.02768159, "epoch": 0.3220201412896438, "flos": 14938251984000.0, "grad_norm": 2.3672932936834035, "language_loss": 0.71493244, "learning_rate": 3.0612977882294306e-06, "loss": 0.73607641, "num_input_tokens_seen": 115034265, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5625, "step": 5356, "time_per_iteration": 2.342625856399536 }, { "auxiliary_loss_clip": 0.01083924, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.01785326, "balance_loss_mlp": 1.02603495, "epoch": 0.32208026454231176, "flos": 22666486631040.0, "grad_norm": 2.537484418368024, "language_loss": 0.67573178, "learning_rate": 3.0609775302112793e-06, "loss": 0.6969254, "num_input_tokens_seen": 115051945, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.578125, "step": 5357, "time_per_iteration": 2.395383596420288 }, { "auxiliary_loss_clip": 0.01017228, "auxiliary_loss_mlp": 0.01002446, "balance_loss_clip": 1.00114655, "balance_loss_mlp": 1.00552416, "epoch": 0.3221403877949797, "flos": 64601606177280.0, "grad_norm": 0.7897995587024933, "language_loss": 0.58244151, "learning_rate": 3.060657234329569e-06, "loss": 0.6026383, "num_input_tokens_seen": 115119090, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.1171875, "step": 5358, "time_per_iteration": 4.523858308792114 }, { "auxiliary_loss_clip": 0.01077367, "auxiliary_loss_mlp": 0.01035881, "balance_loss_clip": 1.01972818, "balance_loss_mlp": 1.02269447, "epoch": 0.3222005110476477, "flos": 20155986696960.0, "grad_norm": 1.7286052533361376, "language_loss": 0.83615881, "learning_rate": 3.06033690059573e-06, "loss": 0.85729128, "num_input_tokens_seen": 115137755, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.546875, "step": 5359, "time_per_iteration": 2.361799478530884 }, { "auxiliary_loss_clip": 0.01078628, "auxiliary_loss_mlp": 0.01030075, "balance_loss_clip": 1.01460195, "balance_loss_mlp": 1.0245235, "epoch": 0.32226063430031565, "flos": 22673084878080.0, "grad_norm": 1.689715791164159, "language_loss": 0.79500908, "learning_rate": 3.060016529021195e-06, "loss": 0.81609607, "num_input_tokens_seen": 115158150, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5390625, "step": 5360, "time_per_iteration": 2.4136157035827637 }, { "auxiliary_loss_clip": 0.01014479, "auxiliary_loss_mlp": 0.01000586, "balance_loss_clip": 0.9994117, "balance_loss_mlp": 1.0027318, "epoch": 0.3223207575529836, "flos": 63825312896640.0, "grad_norm": 0.6529925605651171, "language_loss": 0.56954265, "learning_rate": 3.0596961196173965e-06, "loss": 0.58969331, "num_input_tokens_seen": 115212755, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.11767578, "step": 5361, "time_per_iteration": 2.896439790725708 }, { "auxiliary_loss_clip": 0.0107876, "auxiliary_loss_mlp": 0.01035494, "balance_loss_clip": 1.0193764, "balance_loss_mlp": 1.02444124, "epoch": 0.3223808808056516, "flos": 26868911829120.0, "grad_norm": 5.140079638641671, "language_loss": 0.70762086, "learning_rate": 3.0593756723957695e-06, "loss": 0.72876334, "num_input_tokens_seen": 115233090, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.54296875, "step": 5362, "time_per_iteration": 2.4290249347686768 }, { "auxiliary_loss_clip": 0.01078289, "auxiliary_loss_mlp": 0.0103632, "balance_loss_clip": 1.02153254, "balance_loss_mlp": 1.02497351, "epoch": 0.32244100405831955, "flos": 26760122432640.0, "grad_norm": 1.6110618745369967, "language_loss": 0.73848045, "learning_rate": 3.0590551873677493e-06, "loss": 0.75962651, "num_input_tokens_seen": 115252645, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.53125, "step": 5363, "time_per_iteration": 2.419034242630005 }, { "auxiliary_loss_clip": 0.01082123, "auxiliary_loss_mlp": 0.01037253, "balance_loss_clip": 1.02108836, "balance_loss_mlp": 1.024948, "epoch": 0.3225011273109875, "flos": 23801966645760.0, "grad_norm": 2.0659613830640193, "language_loss": 0.76461691, "learning_rate": 3.058734664544774e-06, "loss": 0.78581065, "num_input_tokens_seen": 115269085, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.57421875, "step": 5364, "time_per_iteration": 2.4025800228118896 }, { "auxiliary_loss_clip": 0.01079656, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.01593637, "balance_loss_mlp": 1.02442658, "epoch": 0.3225612505636555, "flos": 17273557382400.0, "grad_norm": 3.239717133902645, "language_loss": 0.77195823, "learning_rate": 3.0584141039382828e-06, "loss": 0.793082, "num_input_tokens_seen": 115286470, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.55078125, "step": 5365, "time_per_iteration": 2.3846027851104736 }, { "auxiliary_loss_clip": 0.01084334, "auxiliary_loss_mlp": 0.01036756, "balance_loss_clip": 1.02034116, "balance_loss_mlp": 1.02883041, "epoch": 0.32262137381632344, "flos": 23365168225920.0, "grad_norm": 1.6422084171036462, "language_loss": 0.76844335, "learning_rate": 3.0580935055597135e-06, "loss": 0.78965425, "num_input_tokens_seen": 115307000, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5546875, "step": 5366, "time_per_iteration": 2.4153943061828613 }, { "auxiliary_loss_clip": 0.01078505, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.01686645, "balance_loss_mlp": 1.02536798, "epoch": 0.3226814970689914, "flos": 23257670549760.0, "grad_norm": 1.8247870818913614, "language_loss": 0.71983856, "learning_rate": 3.057772869420509e-06, "loss": 0.74094546, "num_input_tokens_seen": 115325925, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.53125, "step": 5367, "time_per_iteration": 2.3906137943267822 }, { "auxiliary_loss_clip": 0.01076456, "auxiliary_loss_mlp": 0.01031678, "balance_loss_clip": 1.01684797, "balance_loss_mlp": 1.02413893, "epoch": 0.32274162032165943, "flos": 16394374725120.0, "grad_norm": 2.0400868125342995, "language_loss": 0.7415911, "learning_rate": 3.057452195532112e-06, "loss": 0.76267242, "num_input_tokens_seen": 115343705, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5234375, "step": 5368, "time_per_iteration": 2.3711793422698975 }, { "auxiliary_loss_clip": 0.01077534, "auxiliary_loss_mlp": 0.01032284, "balance_loss_clip": 1.01760924, "balance_loss_mlp": 1.02591383, "epoch": 0.3228017435743274, "flos": 27853846594560.0, "grad_norm": 1.5648915567637165, "language_loss": 0.78616285, "learning_rate": 3.057131483905967e-06, "loss": 0.80726111, "num_input_tokens_seen": 115364170, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.515625, "step": 5369, "time_per_iteration": 2.441316604614258 }, { "auxiliary_loss_clip": 0.01076838, "auxiliary_loss_mlp": 0.01026979, "balance_loss_clip": 1.01285231, "balance_loss_mlp": 1.02579892, "epoch": 0.32286186682699536, "flos": 19607780528640.0, "grad_norm": 2.135077820571584, "language_loss": 0.83170462, "learning_rate": 3.0568107345535173e-06, "loss": 0.85274273, "num_input_tokens_seen": 115382495, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.51171875, "step": 5370, "time_per_iteration": 2.3850996494293213 }, { "auxiliary_loss_clip": 0.01079691, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.01536465, "balance_loss_mlp": 1.02564371, "epoch": 0.3229219900796633, "flos": 24132873312000.0, "grad_norm": 2.4760410996302076, "language_loss": 0.83028758, "learning_rate": 3.0564899474862112e-06, "loss": 0.85139334, "num_input_tokens_seen": 115399450, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5390625, "step": 5371, "time_per_iteration": 2.4109177589416504 }, { "auxiliary_loss_clip": 0.01082251, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.02056861, "balance_loss_mlp": 1.02494669, "epoch": 0.3229821133323313, "flos": 17747747735040.0, "grad_norm": 2.808182109081233, "language_loss": 0.88815355, "learning_rate": 3.056169122715497e-06, "loss": 0.90935355, "num_input_tokens_seen": 115417700, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5703125, "step": 5372, "time_per_iteration": 2.3465170860290527 }, { "auxiliary_loss_clip": 0.01079449, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.01750314, "balance_loss_mlp": 1.0259248, "epoch": 0.32304223658499925, "flos": 22344936209280.0, "grad_norm": 2.2559786400364317, "language_loss": 0.72769004, "learning_rate": 3.055848260252823e-06, "loss": 0.74881119, "num_input_tokens_seen": 115435840, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53515625, "step": 5373, "time_per_iteration": 2.39790415763855 }, { "auxiliary_loss_clip": 0.01078354, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.01712024, "balance_loss_mlp": 1.02540016, "epoch": 0.3231023598376672, "flos": 18477327749760.0, "grad_norm": 2.1306090181539434, "language_loss": 0.81241184, "learning_rate": 3.055527360109641e-06, "loss": 0.83351153, "num_input_tokens_seen": 115454210, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.53125, "step": 5374, "time_per_iteration": 2.3609631061553955 }, { "auxiliary_loss_clip": 0.01079095, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.01794958, "balance_loss_mlp": 1.02573276, "epoch": 0.3231624830903352, "flos": 27635080815360.0, "grad_norm": 2.8729695217465667, "language_loss": 0.87721264, "learning_rate": 3.0552064222974024e-06, "loss": 0.89833343, "num_input_tokens_seen": 115471785, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53515625, "step": 5375, "time_per_iteration": 2.4294028282165527 }, { "auxiliary_loss_clip": 0.0108127, "auxiliary_loss_mlp": 0.01034899, "balance_loss_clip": 1.01782823, "balance_loss_mlp": 1.02376342, "epoch": 0.32322260634300315, "flos": 21725332577280.0, "grad_norm": 2.5660122403635124, "language_loss": 0.76166165, "learning_rate": 3.054885446827561e-06, "loss": 0.78282332, "num_input_tokens_seen": 115491405, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.57421875, "step": 5376, "time_per_iteration": 2.383476972579956 }, { "auxiliary_loss_clip": 0.010763, "auxiliary_loss_mlp": 0.0102696, "balance_loss_clip": 1.01236224, "balance_loss_mlp": 1.02472198, "epoch": 0.3232827295956711, "flos": 22636565729280.0, "grad_norm": 1.8001887070961868, "language_loss": 0.6709525, "learning_rate": 3.0545644337115716e-06, "loss": 0.69198507, "num_input_tokens_seen": 115511555, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.515625, "step": 5377, "time_per_iteration": 2.4060797691345215 }, { "auxiliary_loss_clip": 0.01079296, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.02427065, "balance_loss_mlp": 1.02596271, "epoch": 0.3233428528483391, "flos": 26321403888000.0, "grad_norm": 1.3927412936248764, "language_loss": 0.72254539, "learning_rate": 3.0542433829608902e-06, "loss": 0.74374539, "num_input_tokens_seen": 115532860, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5378, "time_per_iteration": 2.4265549182891846 }, { "auxiliary_loss_clip": 0.01077624, "auxiliary_loss_mlp": 0.01028484, "balance_loss_clip": 1.01256919, "balance_loss_mlp": 1.02334094, "epoch": 0.32340297610100704, "flos": 28583950279680.0, "grad_norm": 2.592730588230886, "language_loss": 0.81970894, "learning_rate": 3.0539222945869742e-06, "loss": 0.84077007, "num_input_tokens_seen": 115553850, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.54296875, "step": 5379, "time_per_iteration": 2.519977569580078 }, { "auxiliary_loss_clip": 0.0108063, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.01434147, "balance_loss_mlp": 1.02522683, "epoch": 0.323463099353675, "flos": 30772480855680.0, "grad_norm": 2.5958444129036886, "language_loss": 0.78698713, "learning_rate": 3.0536011686012827e-06, "loss": 0.80808771, "num_input_tokens_seen": 115575530, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5546875, "step": 5380, "time_per_iteration": 2.4635374546051025 }, { "auxiliary_loss_clip": 0.01080093, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.014359, "balance_loss_mlp": 1.02611732, "epoch": 0.32352322260634303, "flos": 25227435346560.0, "grad_norm": 1.7120370289719287, "language_loss": 0.76934105, "learning_rate": 3.0532800050152752e-06, "loss": 0.79043263, "num_input_tokens_seen": 115594885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5390625, "step": 5381, "time_per_iteration": 2.428349494934082 }, { "auxiliary_loss_clip": 0.01075395, "auxiliary_loss_mlp": 0.01028506, "balance_loss_clip": 1.01452243, "balance_loss_mlp": 1.02448523, "epoch": 0.323583345859011, "flos": 23329382215680.0, "grad_norm": 1.6992395955451058, "language_loss": 0.71832007, "learning_rate": 3.052958803840414e-06, "loss": 0.73935908, "num_input_tokens_seen": 115614080, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.5078125, "step": 5382, "time_per_iteration": 2.3826966285705566 }, { "auxiliary_loss_clip": 0.01080542, "auxiliary_loss_mlp": 0.01038824, "balance_loss_clip": 1.02308822, "balance_loss_mlp": 1.02441061, "epoch": 0.32364346911167896, "flos": 26206470092160.0, "grad_norm": 3.099905045241218, "language_loss": 0.70067793, "learning_rate": 3.0526375650881617e-06, "loss": 0.72187161, "num_input_tokens_seen": 115632820, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5625, "step": 5383, "time_per_iteration": 2.48154354095459 }, { "auxiliary_loss_clip": 0.01075978, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.01544309, "balance_loss_mlp": 1.02491498, "epoch": 0.3237035923643469, "flos": 23694643526400.0, "grad_norm": 2.2161622735116917, "language_loss": 0.78149533, "learning_rate": 3.0523162887699824e-06, "loss": 0.8025412, "num_input_tokens_seen": 115652860, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.51171875, "step": 5384, "time_per_iteration": 2.3936946392059326 }, { "auxiliary_loss_clip": 0.01082099, "auxiliary_loss_mlp": 0.0103474, "balance_loss_clip": 1.01907587, "balance_loss_mlp": 1.02628994, "epoch": 0.3237637156170149, "flos": 14427856684800.0, "grad_norm": 2.306999604704847, "language_loss": 0.75232095, "learning_rate": 3.051994974897342e-06, "loss": 0.77348936, "num_input_tokens_seen": 115670940, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.55859375, "step": 5385, "time_per_iteration": 2.379140853881836 }, { "auxiliary_loss_clip": 0.01078455, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.01775217, "balance_loss_mlp": 1.02491426, "epoch": 0.32382383886968286, "flos": 31061736403200.0, "grad_norm": 2.0634454545759655, "language_loss": 0.71936297, "learning_rate": 3.051673623481706e-06, "loss": 0.7404815, "num_input_tokens_seen": 115691155, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 5386, "time_per_iteration": 2.506383180618286 }, { "auxiliary_loss_clip": 0.01079334, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.01449919, "balance_loss_mlp": 1.02353239, "epoch": 0.3238839621223508, "flos": 23255855159040.0, "grad_norm": 1.787912039359255, "language_loss": 0.94587326, "learning_rate": 3.0513522345345446e-06, "loss": 0.96697545, "num_input_tokens_seen": 115710340, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.55859375, "step": 5387, "time_per_iteration": 2.4188995361328125 }, { "auxiliary_loss_clip": 0.01080641, "auxiliary_loss_mlp": 0.01034944, "balance_loss_clip": 1.01856422, "balance_loss_mlp": 1.02532244, "epoch": 0.3239440853750188, "flos": 22963597234560.0, "grad_norm": 3.300937406522664, "language_loss": 0.77612454, "learning_rate": 3.0510308080673256e-06, "loss": 0.79728043, "num_input_tokens_seen": 115726745, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5546875, "step": 5388, "time_per_iteration": 3.7762293815612793 }, { "auxiliary_loss_clip": 0.01080741, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 1.01455331, "balance_loss_mlp": 1.02494729, "epoch": 0.32400420862768675, "flos": 36245151383040.0, "grad_norm": 1.896369953650391, "language_loss": 0.71499395, "learning_rate": 3.0507093440915214e-06, "loss": 0.73611259, "num_input_tokens_seen": 115749385, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55859375, "step": 5389, "time_per_iteration": 2.5134568214416504 }, { "auxiliary_loss_clip": 0.01078189, "auxiliary_loss_mlp": 0.01033765, "balance_loss_clip": 1.0176239, "balance_loss_mlp": 1.02494574, "epoch": 0.3240643318803547, "flos": 21615426017280.0, "grad_norm": 2.2665248512681244, "language_loss": 0.80806518, "learning_rate": 3.0503878426186028e-06, "loss": 0.82918477, "num_input_tokens_seen": 115768105, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.53125, "step": 5390, "time_per_iteration": 2.3800621032714844 }, { "auxiliary_loss_clip": 0.01081436, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.02858067, "balance_loss_mlp": 1.02740049, "epoch": 0.3241244551330227, "flos": 23294468989440.0, "grad_norm": 8.930226657265974, "language_loss": 0.72171915, "learning_rate": 3.050066303660044e-06, "loss": 0.74297786, "num_input_tokens_seen": 115787340, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5390625, "step": 5391, "time_per_iteration": 2.418109178543091 }, { "auxiliary_loss_clip": 0.01076605, "auxiliary_loss_mlp": 0.01032707, "balance_loss_clip": 1.01793671, "balance_loss_mlp": 1.02426147, "epoch": 0.32418457838569065, "flos": 14096461259520.0, "grad_norm": 2.867852821443578, "language_loss": 0.77113712, "learning_rate": 3.0497447272273203e-06, "loss": 0.79223019, "num_input_tokens_seen": 115805565, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5234375, "step": 5392, "time_per_iteration": 3.818276882171631 }, { "auxiliary_loss_clip": 0.01082108, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.01781976, "balance_loss_mlp": 1.02654922, "epoch": 0.3242447016383586, "flos": 29751376055040.0, "grad_norm": 1.8130150819251067, "language_loss": 0.62515903, "learning_rate": 3.049423113331907e-06, "loss": 0.64631963, "num_input_tokens_seen": 115826725, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5546875, "step": 5393, "time_per_iteration": 3.904017210006714 }, { "auxiliary_loss_clip": 0.01078697, "auxiliary_loss_mlp": 0.01034742, "balance_loss_clip": 1.01972163, "balance_loss_mlp": 1.02516043, "epoch": 0.3243048248910266, "flos": 24350102991360.0, "grad_norm": 1.608725193401751, "language_loss": 0.82639152, "learning_rate": 3.049101461985283e-06, "loss": 0.84752595, "num_input_tokens_seen": 115846955, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53515625, "step": 5394, "time_per_iteration": 2.436965227127075 }, { "auxiliary_loss_clip": 0.0107615, "auxiliary_loss_mlp": 0.01036876, "balance_loss_clip": 1.02332175, "balance_loss_mlp": 1.02509201, "epoch": 0.3243649481436946, "flos": 24351883470720.0, "grad_norm": 2.9218368106545247, "language_loss": 0.81923747, "learning_rate": 3.048779773198926e-06, "loss": 0.84036779, "num_input_tokens_seen": 115865975, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.51171875, "step": 5395, "time_per_iteration": 2.4302804470062256 }, { "auxiliary_loss_clip": 0.01077724, "auxiliary_loss_mlp": 0.01031875, "balance_loss_clip": 1.01837444, "balance_loss_mlp": 1.02696395, "epoch": 0.32442507139636256, "flos": 22924250265600.0, "grad_norm": 1.737781338974741, "language_loss": 0.83581054, "learning_rate": 3.048458046984317e-06, "loss": 0.85690653, "num_input_tokens_seen": 115884950, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.5078125, "step": 5396, "time_per_iteration": 2.405505895614624 }, { "auxiliary_loss_clip": 0.01082508, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.02117813, "balance_loss_mlp": 1.02619088, "epoch": 0.32448519464903053, "flos": 22199103993600.0, "grad_norm": 1.8534963832756508, "language_loss": 0.75302124, "learning_rate": 3.0481362833529363e-06, "loss": 0.77421606, "num_input_tokens_seen": 115904170, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5625, "step": 5397, "time_per_iteration": 3.7684366703033447 }, { "auxiliary_loss_clip": 0.01078287, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.01461387, "balance_loss_mlp": 1.02486801, "epoch": 0.3245453179016985, "flos": 18837596736000.0, "grad_norm": 2.2254877229769248, "language_loss": 0.66831249, "learning_rate": 3.0478144823162686e-06, "loss": 0.68938887, "num_input_tokens_seen": 115919255, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.53515625, "step": 5398, "time_per_iteration": 2.3500254154205322 }, { "auxiliary_loss_clip": 0.01077051, "auxiliary_loss_mlp": 0.01023777, "balance_loss_clip": 1.00887644, "balance_loss_mlp": 1.02394485, "epoch": 0.32460544115436646, "flos": 21177335877120.0, "grad_norm": 1.4097211460049397, "language_loss": 0.72865582, "learning_rate": 3.0474926438857976e-06, "loss": 0.74966413, "num_input_tokens_seen": 115938535, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.53125, "step": 5399, "time_per_iteration": 2.4297893047332764 }, { "auxiliary_loss_clip": 0.01079311, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.01638675, "balance_loss_mlp": 1.02466607, "epoch": 0.3246655644070344, "flos": 21980058923520.0, "grad_norm": 3.2654722497656126, "language_loss": 0.713521, "learning_rate": 3.047170768073008e-06, "loss": 0.73463267, "num_input_tokens_seen": 115955005, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.546875, "step": 5400, "time_per_iteration": 2.387359380722046 }, { "auxiliary_loss_clip": 0.01081027, "auxiliary_loss_mlp": 0.01035176, "balance_loss_clip": 1.02026868, "balance_loss_mlp": 1.025769, "epoch": 0.3247256876597024, "flos": 32158393119360.0, "grad_norm": 2.1169328533256455, "language_loss": 0.79567647, "learning_rate": 3.046848854889388e-06, "loss": 0.8168385, "num_input_tokens_seen": 115975305, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.55078125, "step": 5401, "time_per_iteration": 2.4848508834838867 }, { "auxiliary_loss_clip": 0.01079866, "auxiliary_loss_mlp": 0.01038272, "balance_loss_clip": 1.02216136, "balance_loss_mlp": 1.02576995, "epoch": 0.32478581091237035, "flos": 20996450789760.0, "grad_norm": 1.5743781154015057, "language_loss": 0.87424928, "learning_rate": 3.0465269043464243e-06, "loss": 0.89543062, "num_input_tokens_seen": 115994810, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.54296875, "step": 5402, "time_per_iteration": 2.3826820850372314 }, { "auxiliary_loss_clip": 0.01074204, "auxiliary_loss_mlp": 0.01032167, "balance_loss_clip": 1.01690769, "balance_loss_mlp": 1.02333927, "epoch": 0.3248459341650383, "flos": 17924199079680.0, "grad_norm": 3.434140366835923, "language_loss": 0.84468889, "learning_rate": 3.0462049164556082e-06, "loss": 0.86575258, "num_input_tokens_seen": 116011095, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 5403, "time_per_iteration": 2.3555710315704346 }, { "auxiliary_loss_clip": 0.01081101, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.01775241, "balance_loss_mlp": 1.02975714, "epoch": 0.3249060574177063, "flos": 24534444303360.0, "grad_norm": 2.19252206773407, "language_loss": 0.86714506, "learning_rate": 3.0458828912284293e-06, "loss": 0.88827109, "num_input_tokens_seen": 116028805, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.51171875, "step": 5404, "time_per_iteration": 2.424011707305908 }, { "auxiliary_loss_clip": 0.01078396, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.0167017, "balance_loss_mlp": 1.0257628, "epoch": 0.32496618067037425, "flos": 25993569421440.0, "grad_norm": 1.5546801839984006, "language_loss": 0.72701812, "learning_rate": 3.0455608286763803e-06, "loss": 0.74811774, "num_input_tokens_seen": 116047765, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.52734375, "step": 5405, "time_per_iteration": 2.4844679832458496 }, { "auxiliary_loss_clip": 0.01077176, "auxiliary_loss_mlp": 0.0103331, "balance_loss_clip": 1.0177412, "balance_loss_mlp": 1.0244993, "epoch": 0.3250263039230422, "flos": 19572727656960.0, "grad_norm": 1.641493441548095, "language_loss": 0.82919037, "learning_rate": 3.045238728810955e-06, "loss": 0.85029519, "num_input_tokens_seen": 116068385, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.52734375, "step": 5406, "time_per_iteration": 2.4126479625701904 }, { "auxiliary_loss_clip": 0.0107731, "auxiliary_loss_mlp": 0.0103144, "balance_loss_clip": 1.01671171, "balance_loss_mlp": 1.02566195, "epoch": 0.3250864271757102, "flos": 16762708235520.0, "grad_norm": 1.7530919370193365, "language_loss": 0.87812674, "learning_rate": 3.0449165916436485e-06, "loss": 0.89921427, "num_input_tokens_seen": 116085350, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.515625, "step": 5407, "time_per_iteration": 2.373321533203125 }, { "auxiliary_loss_clip": 0.0107742, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.01505244, "balance_loss_mlp": 1.02433276, "epoch": 0.3251465504283782, "flos": 27818200229760.0, "grad_norm": 1.6656429373095116, "language_loss": 0.69662368, "learning_rate": 3.044594417185956e-06, "loss": 0.71770191, "num_input_tokens_seen": 116107560, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.53125, "step": 5408, "time_per_iteration": 2.4401321411132812 }, { "auxiliary_loss_clip": 0.01083446, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.01321888, "balance_loss_mlp": 1.02557099, "epoch": 0.32520667368104617, "flos": 19062122889600.0, "grad_norm": 1.7759011843616177, "language_loss": 0.77459997, "learning_rate": 3.044272205449376e-06, "loss": 0.79572773, "num_input_tokens_seen": 116125980, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.578125, "step": 5409, "time_per_iteration": 2.3784339427948 }, { "auxiliary_loss_clip": 0.0107907, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.02132392, "balance_loss_mlp": 1.02310836, "epoch": 0.32526679693371413, "flos": 29381017685760.0, "grad_norm": 1.7387065242879542, "language_loss": 0.83244413, "learning_rate": 3.0439499564454073e-06, "loss": 0.85360563, "num_input_tokens_seen": 116146530, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5625, "step": 5410, "time_per_iteration": 2.433255910873413 }, { "auxiliary_loss_clip": 0.01075946, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.01973033, "balance_loss_mlp": 1.02407193, "epoch": 0.3253269201863821, "flos": 20703459726720.0, "grad_norm": 1.541437800142103, "language_loss": 0.7082814, "learning_rate": 3.04362767018555e-06, "loss": 0.72938603, "num_input_tokens_seen": 116165695, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.515625, "step": 5411, "time_per_iteration": 2.3710265159606934 }, { "auxiliary_loss_clip": 0.01079104, "auxiliary_loss_mlp": 0.01033374, "balance_loss_clip": 1.01905727, "balance_loss_mlp": 1.0264312, "epoch": 0.32538704343905006, "flos": 29092914213120.0, "grad_norm": 1.4822385106475402, "language_loss": 0.82966936, "learning_rate": 3.0433053466813053e-06, "loss": 0.8507942, "num_input_tokens_seen": 116185375, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.52734375, "step": 5412, "time_per_iteration": 2.4416842460632324 }, { "auxiliary_loss_clip": 0.01078737, "auxiliary_loss_mlp": 0.01031884, "balance_loss_clip": 1.01546848, "balance_loss_mlp": 1.02402878, "epoch": 0.325447166691718, "flos": 24675109637760.0, "grad_norm": 1.7157391316373405, "language_loss": 0.80915964, "learning_rate": 3.042982985944177e-06, "loss": 0.83026582, "num_input_tokens_seen": 116204335, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 5413, "time_per_iteration": 2.4012088775634766 }, { "auxiliary_loss_clip": 0.01077817, "auxiliary_loss_mlp": 0.01031647, "balance_loss_clip": 1.01567328, "balance_loss_mlp": 1.0245018, "epoch": 0.325507289944386, "flos": 21542073517440.0, "grad_norm": 1.6593193477589114, "language_loss": 0.76812875, "learning_rate": 3.0426605879856685e-06, "loss": 0.78922343, "num_input_tokens_seen": 116222840, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53515625, "step": 5414, "time_per_iteration": 2.3885934352874756 }, { "auxiliary_loss_clip": 0.01012807, "auxiliary_loss_mlp": 0.01013497, "balance_loss_clip": 1.01207817, "balance_loss_mlp": 1.00135851, "epoch": 0.32556741319705396, "flos": 71515527206400.0, "grad_norm": 0.9144087985326989, "language_loss": 0.63917202, "learning_rate": 3.0423381528172864e-06, "loss": 0.65943509, "num_input_tokens_seen": 116274940, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.11425781, "step": 5415, "time_per_iteration": 2.9493916034698486 }, { "auxiliary_loss_clip": 0.0107686, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.01292014, "balance_loss_mlp": 1.02416492, "epoch": 0.3256275364497219, "flos": 23731302320640.0, "grad_norm": 1.7554377611460208, "language_loss": 0.74007642, "learning_rate": 3.042015680450536e-06, "loss": 0.76112533, "num_input_tokens_seen": 116297300, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.52734375, "step": 5416, "time_per_iteration": 2.4102070331573486 }, { "auxiliary_loss_clip": 0.01013101, "auxiliary_loss_mlp": 0.01002505, "balance_loss_clip": 1.00118756, "balance_loss_mlp": 1.00168014, "epoch": 0.3256876597023899, "flos": 67286043838080.0, "grad_norm": 0.783166779440946, "language_loss": 0.57991099, "learning_rate": 3.041693170896926e-06, "loss": 0.60006702, "num_input_tokens_seen": 116362370, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.11425781, "step": 5417, "time_per_iteration": 3.112537384033203 }, { "auxiliary_loss_clip": 0.01013247, "auxiliary_loss_mlp": 0.00999626, "balance_loss_clip": 0.99820149, "balance_loss_mlp": 1.0018003, "epoch": 0.32574778295505785, "flos": 71278605653760.0, "grad_norm": 0.888885935789413, "language_loss": 0.63371241, "learning_rate": 3.0413706241679674e-06, "loss": 0.65384114, "num_input_tokens_seen": 116430365, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.11425781, "step": 5418, "time_per_iteration": 3.155064344406128 }, { "auxiliary_loss_clip": 0.01076001, "auxiliary_loss_mlp": 0.01035298, "balance_loss_clip": 1.02075422, "balance_loss_mlp": 1.02432656, "epoch": 0.3258079062077258, "flos": 20775345949440.0, "grad_norm": 2.9823348293798655, "language_loss": 0.69778025, "learning_rate": 3.041048040275169e-06, "loss": 0.71889329, "num_input_tokens_seen": 116447525, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.515625, "step": 5419, "time_per_iteration": 2.3773465156555176 }, { "auxiliary_loss_clip": 0.01078726, "auxiliary_loss_mlp": 0.01031779, "balance_loss_clip": 1.01542389, "balance_loss_mlp": 1.02577317, "epoch": 0.3258680294603938, "flos": 22234401244800.0, "grad_norm": 1.9098820078835552, "language_loss": 0.77784669, "learning_rate": 3.0407254192300444e-06, "loss": 0.79895169, "num_input_tokens_seen": 116466310, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5420, "time_per_iteration": 2.4065260887145996 }, { "auxiliary_loss_clip": 0.01079892, "auxiliary_loss_mlp": 0.01034915, "balance_loss_clip": 1.01897705, "balance_loss_mlp": 1.02523708, "epoch": 0.3259281527130618, "flos": 26978748566400.0, "grad_norm": 1.5051119461954516, "language_loss": 0.79559088, "learning_rate": 3.040402761044107e-06, "loss": 0.81673896, "num_input_tokens_seen": 116487825, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.546875, "step": 5421, "time_per_iteration": 2.4390525817871094 }, { "auxiliary_loss_clip": 0.01074568, "auxiliary_loss_mlp": 0.01032796, "balance_loss_clip": 1.01919973, "balance_loss_mlp": 1.0242002, "epoch": 0.32598827596572977, "flos": 26213033427840.0, "grad_norm": 2.4132741181899204, "language_loss": 0.75062263, "learning_rate": 3.040080065728871e-06, "loss": 0.77169627, "num_input_tokens_seen": 116509950, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.50390625, "step": 5422, "time_per_iteration": 2.4640307426452637 }, { "auxiliary_loss_clip": 0.01079739, "auxiliary_loss_mlp": 0.01036498, "balance_loss_clip": 1.02117372, "balance_loss_mlp": 1.02581573, "epoch": 0.32604839921839773, "flos": 17638783781760.0, "grad_norm": 2.2617686532229517, "language_loss": 0.63190514, "learning_rate": 3.0397573332958527e-06, "loss": 0.65306753, "num_input_tokens_seen": 116527695, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5390625, "step": 5423, "time_per_iteration": 2.3707315921783447 }, { "auxiliary_loss_clip": 0.01072296, "auxiliary_loss_mlp": 0.01030382, "balance_loss_clip": 1.01644063, "balance_loss_mlp": 1.0233438, "epoch": 0.3261085224710657, "flos": 23621605228800.0, "grad_norm": 1.660012476943711, "language_loss": 0.74586529, "learning_rate": 3.039434563756569e-06, "loss": 0.76689208, "num_input_tokens_seen": 116547800, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.48828125, "step": 5424, "time_per_iteration": 2.4088220596313477 }, { "auxiliary_loss_clip": 0.01073955, "auxiliary_loss_mlp": 0.01028688, "balance_loss_clip": 1.01474011, "balance_loss_mlp": 1.02412653, "epoch": 0.32616864572373366, "flos": 23259276472320.0, "grad_norm": 1.627510589085908, "language_loss": 0.77298176, "learning_rate": 3.0391117571225407e-06, "loss": 0.7940082, "num_input_tokens_seen": 116568460, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.49804688, "step": 5425, "time_per_iteration": 2.4299569129943848 }, { "auxiliary_loss_clip": 0.0107992, "auxiliary_loss_mlp": 0.01032948, "balance_loss_clip": 1.01578176, "balance_loss_mlp": 1.02493644, "epoch": 0.32622876897640163, "flos": 25592242809600.0, "grad_norm": 2.4558171768548895, "language_loss": 0.7800011, "learning_rate": 3.0387889134052866e-06, "loss": 0.80112982, "num_input_tokens_seen": 116588705, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.546875, "step": 5426, "time_per_iteration": 2.4657649993896484 }, { "auxiliary_loss_clip": 0.01078852, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.01478291, "balance_loss_mlp": 1.02712727, "epoch": 0.3262888922290696, "flos": 22417904684160.0, "grad_norm": 1.6946984512612686, "language_loss": 0.74343133, "learning_rate": 3.0384660326163277e-06, "loss": 0.76451916, "num_input_tokens_seen": 116608845, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.515625, "step": 5427, "time_per_iteration": 2.4272100925445557 }, { "auxiliary_loss_clip": 0.01079176, "auxiliary_loss_mlp": 0.01033608, "balance_loss_clip": 1.01759815, "balance_loss_mlp": 1.02467334, "epoch": 0.32634901548173756, "flos": 19717896556800.0, "grad_norm": 2.333646384732536, "language_loss": 0.79019922, "learning_rate": 3.0381431147671875e-06, "loss": 0.8113271, "num_input_tokens_seen": 116628145, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.546875, "step": 5428, "time_per_iteration": 3.756551504135132 }, { "auxiliary_loss_clip": 0.01075906, "auxiliary_loss_mlp": 0.01032761, "balance_loss_clip": 1.01808619, "balance_loss_mlp": 1.02349102, "epoch": 0.3264091387344055, "flos": 16142022351360.0, "grad_norm": 1.7366548579309573, "language_loss": 0.71382821, "learning_rate": 3.03782015986939e-06, "loss": 0.7349149, "num_input_tokens_seen": 116646920, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5234375, "step": 5429, "time_per_iteration": 2.368297576904297 }, { "auxiliary_loss_clip": 0.01077668, "auxiliary_loss_mlp": 0.0103048, "balance_loss_clip": 1.01642549, "balance_loss_mlp": 1.02595758, "epoch": 0.3264692619870735, "flos": 16398145152000.0, "grad_norm": 1.7309756186465304, "language_loss": 0.78436255, "learning_rate": 3.037497167934461e-06, "loss": 0.805444, "num_input_tokens_seen": 116665100, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.515625, "step": 5430, "time_per_iteration": 2.3637843132019043 }, { "auxiliary_loss_clip": 0.01079765, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.01623201, "balance_loss_mlp": 1.02481902, "epoch": 0.32652938523974145, "flos": 22381245889920.0, "grad_norm": 2.344577465160989, "language_loss": 0.84277546, "learning_rate": 3.037174138973927e-06, "loss": 0.86390448, "num_input_tokens_seen": 116682205, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.55078125, "step": 5431, "time_per_iteration": 2.3848962783813477 }, { "auxiliary_loss_clip": 0.01074432, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.01854372, "balance_loss_mlp": 1.02247286, "epoch": 0.3265895084924094, "flos": 21906985714560.0, "grad_norm": 5.324207634185405, "language_loss": 0.70300651, "learning_rate": 3.0368510729993147e-06, "loss": 0.72408426, "num_input_tokens_seen": 116702575, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.51953125, "step": 5432, "time_per_iteration": 3.8911707401275635 }, { "auxiliary_loss_clip": 0.01074675, "auxiliary_loss_mlp": 0.01025699, "balance_loss_clip": 1.01156664, "balance_loss_mlp": 1.02224946, "epoch": 0.3266496317450774, "flos": 16066330790400.0, "grad_norm": 2.310135841221492, "language_loss": 0.84323114, "learning_rate": 3.0365279700221555e-06, "loss": 0.86423481, "num_input_tokens_seen": 116720885, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.5234375, "step": 5433, "time_per_iteration": 3.7029778957366943 }, { "auxiliary_loss_clip": 0.01075471, "auxiliary_loss_mlp": 0.01027421, "balance_loss_clip": 1.01315725, "balance_loss_mlp": 1.02445185, "epoch": 0.3267097549977454, "flos": 22527147928320.0, "grad_norm": 1.3751413669100931, "language_loss": 0.85843301, "learning_rate": 3.036204830053979e-06, "loss": 0.879462, "num_input_tokens_seen": 116740395, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.5078125, "step": 5434, "time_per_iteration": 2.393791437149048 }, { "auxiliary_loss_clip": 0.01079266, "auxiliary_loss_mlp": 0.01033873, "balance_loss_clip": 1.01808953, "balance_loss_mlp": 1.02544296, "epoch": 0.32676987825041337, "flos": 27269226011520.0, "grad_norm": 1.767099675996129, "language_loss": 0.87589449, "learning_rate": 3.035881653106318e-06, "loss": 0.89702582, "num_input_tokens_seen": 116758870, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5390625, "step": 5435, "time_per_iteration": 2.4210550785064697 }, { "auxiliary_loss_clip": 0.01077102, "auxiliary_loss_mlp": 0.01033977, "balance_loss_clip": 1.01865852, "balance_loss_mlp": 1.02470124, "epoch": 0.32683000150308134, "flos": 11507511767040.0, "grad_norm": 2.413629176530171, "language_loss": 0.76559198, "learning_rate": 3.035558439190705e-06, "loss": 0.78670275, "num_input_tokens_seen": 116773440, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5234375, "step": 5436, "time_per_iteration": 2.349705696105957 }, { "auxiliary_loss_clip": 0.01078114, "auxiliary_loss_mlp": 0.01029775, "balance_loss_clip": 1.01520216, "balance_loss_mlp": 1.0256958, "epoch": 0.3268901247557493, "flos": 25629006337920.0, "grad_norm": 1.5246901564698747, "language_loss": 0.71808475, "learning_rate": 3.0352351883186753e-06, "loss": 0.73916364, "num_input_tokens_seen": 116794375, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.5234375, "step": 5437, "time_per_iteration": 3.8887460231781006 }, { "auxiliary_loss_clip": 0.01076429, "auxiliary_loss_mlp": 0.01033303, "balance_loss_clip": 1.0166254, "balance_loss_mlp": 1.02238953, "epoch": 0.32695024800841727, "flos": 24859765152000.0, "grad_norm": 1.5498046810882387, "language_loss": 0.63579702, "learning_rate": 3.034911900501765e-06, "loss": 0.65689439, "num_input_tokens_seen": 116815095, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5390625, "step": 5438, "time_per_iteration": 2.4118716716766357 }, { "auxiliary_loss_clip": 0.01077552, "auxiliary_loss_mlp": 0.01029749, "balance_loss_clip": 1.01378727, "balance_loss_mlp": 1.02547598, "epoch": 0.32701037126108523, "flos": 28838013310080.0, "grad_norm": 1.453865626869374, "language_loss": 0.74592376, "learning_rate": 3.0345885757515104e-06, "loss": 0.76699674, "num_input_tokens_seen": 116836630, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.51953125, "step": 5439, "time_per_iteration": 2.428367853164673 }, { "auxiliary_loss_clip": 0.01078362, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.0178982, "balance_loss_mlp": 1.0250299, "epoch": 0.3270704945137532, "flos": 27963822977280.0, "grad_norm": 1.851964283862922, "language_loss": 0.74615502, "learning_rate": 3.034265214079451e-06, "loss": 0.76726878, "num_input_tokens_seen": 116856880, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.53125, "step": 5440, "time_per_iteration": 2.430908203125 }, { "auxiliary_loss_clip": 0.01077425, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.02284217, "balance_loss_mlp": 1.0243032, "epoch": 0.32713061776642116, "flos": 23689721024640.0, "grad_norm": 1.9272560702716868, "language_loss": 0.84941757, "learning_rate": 3.0339418154971262e-06, "loss": 0.87057185, "num_input_tokens_seen": 116873770, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53125, "step": 5441, "time_per_iteration": 2.3966825008392334 }, { "auxiliary_loss_clip": 0.01079412, "auxiliary_loss_mlp": 0.01034647, "balance_loss_clip": 1.01808846, "balance_loss_mlp": 1.02504659, "epoch": 0.3271907410190891, "flos": 22454528567040.0, "grad_norm": 2.0114420325569675, "language_loss": 0.86473727, "learning_rate": 3.0336183800160786e-06, "loss": 0.88587785, "num_input_tokens_seen": 116891225, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.54296875, "step": 5442, "time_per_iteration": 2.380145788192749 }, { "auxiliary_loss_clip": 0.01080376, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.01675582, "balance_loss_mlp": 1.02519119, "epoch": 0.3272508642717571, "flos": 22819021827840.0, "grad_norm": 1.5505837294884508, "language_loss": 0.77489972, "learning_rate": 3.033294907647849e-06, "loss": 0.79603767, "num_input_tokens_seen": 116912300, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55078125, "step": 5443, "time_per_iteration": 2.395960807800293 }, { "auxiliary_loss_clip": 0.0107767, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.01604891, "balance_loss_mlp": 1.02239799, "epoch": 0.32731098752442506, "flos": 11800572652800.0, "grad_norm": 2.8015540557958403, "language_loss": 0.81702423, "learning_rate": 3.0329713984039824e-06, "loss": 0.83811897, "num_input_tokens_seen": 116929425, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5546875, "step": 5444, "time_per_iteration": 2.348742723464966 }, { "auxiliary_loss_clip": 0.01077456, "auxiliary_loss_mlp": 0.01034156, "balance_loss_clip": 1.01850367, "balance_loss_mlp": 1.0237186, "epoch": 0.327371110777093, "flos": 21026860450560.0, "grad_norm": 2.0066738671434616, "language_loss": 0.58649683, "learning_rate": 3.032647852296024e-06, "loss": 0.60761285, "num_input_tokens_seen": 116948255, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5390625, "step": 5445, "time_per_iteration": 2.4022276401519775 }, { "auxiliary_loss_clip": 0.01079605, "auxiliary_loss_mlp": 0.01035419, "balance_loss_clip": 1.01877713, "balance_loss_mlp": 1.0249486, "epoch": 0.327431234029761, "flos": 19061110460160.0, "grad_norm": 2.5663118672278142, "language_loss": 0.88257396, "learning_rate": 3.0323242693355195e-06, "loss": 0.90372419, "num_input_tokens_seen": 116964905, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.546875, "step": 5446, "time_per_iteration": 2.359149932861328 }, { "auxiliary_loss_clip": 0.01083825, "auxiliary_loss_mlp": 0.01038657, "balance_loss_clip": 1.02052522, "balance_loss_mlp": 1.02630043, "epoch": 0.32749135728242895, "flos": 25848016496640.0, "grad_norm": 2.8254414686427483, "language_loss": 0.79008245, "learning_rate": 3.0320006495340175e-06, "loss": 0.81130731, "num_input_tokens_seen": 116983650, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.578125, "step": 5447, "time_per_iteration": 2.4090781211853027 }, { "auxiliary_loss_clip": 0.01079066, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.01882958, "balance_loss_mlp": 1.02469659, "epoch": 0.327551480535097, "flos": 20119502459520.0, "grad_norm": 2.4382832938527472, "language_loss": 0.73265076, "learning_rate": 3.0316769929030672e-06, "loss": 0.75377548, "num_input_tokens_seen": 117003265, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.54296875, "step": 5448, "time_per_iteration": 2.3767454624176025 }, { "auxiliary_loss_clip": 0.01077026, "auxiliary_loss_mlp": 0.01033209, "balance_loss_clip": 1.01808095, "balance_loss_mlp": 1.02421916, "epoch": 0.32761160378776494, "flos": 28802297122560.0, "grad_norm": 2.7915555525855007, "language_loss": 0.66764009, "learning_rate": 3.0313532994542185e-06, "loss": 0.6887424, "num_input_tokens_seen": 117025370, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.52734375, "step": 5449, "time_per_iteration": 2.4324452877044678 }, { "auxiliary_loss_clip": 0.01075557, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.01382208, "balance_loss_mlp": 1.02304649, "epoch": 0.3276717270404329, "flos": 26936713422720.0, "grad_norm": 1.4328077652786053, "language_loss": 0.65584016, "learning_rate": 3.0310295691990234e-06, "loss": 0.67688155, "num_input_tokens_seen": 117044350, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.52734375, "step": 5450, "time_per_iteration": 2.4380178451538086 }, { "auxiliary_loss_clip": 0.01079037, "auxiliary_loss_mlp": 0.01031527, "balance_loss_clip": 1.01591063, "balance_loss_mlp": 1.02471006, "epoch": 0.32773185029310087, "flos": 25337237172480.0, "grad_norm": 1.8563300319989955, "language_loss": 0.77256566, "learning_rate": 3.030705802149035e-06, "loss": 0.79367125, "num_input_tokens_seen": 117064450, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.54296875, "step": 5451, "time_per_iteration": 2.4091036319732666 }, { "auxiliary_loss_clip": 0.01079516, "auxiliary_loss_mlp": 0.01034288, "balance_loss_clip": 1.01773548, "balance_loss_mlp": 1.02491164, "epoch": 0.32779197354576883, "flos": 26390636847360.0, "grad_norm": 2.9295776544066325, "language_loss": 0.70404297, "learning_rate": 3.030381998315808e-06, "loss": 0.72518098, "num_input_tokens_seen": 117083060, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.546875, "step": 5452, "time_per_iteration": 2.4223482608795166 }, { "auxiliary_loss_clip": 0.01075831, "auxiliary_loss_mlp": 0.01029826, "balance_loss_clip": 1.01447225, "balance_loss_mlp": 1.02386236, "epoch": 0.3278520967984368, "flos": 24898239336960.0, "grad_norm": 1.5042685434288139, "language_loss": 0.78481078, "learning_rate": 3.030058157710899e-06, "loss": 0.80586743, "num_input_tokens_seen": 117101860, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.51953125, "step": 5453, "time_per_iteration": 2.4038357734680176 }, { "auxiliary_loss_clip": 0.01078186, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.01371348, "balance_loss_mlp": 1.02466023, "epoch": 0.32791222005110476, "flos": 29751690257280.0, "grad_norm": 2.8454525940046937, "language_loss": 0.75492507, "learning_rate": 3.0297342803458624e-06, "loss": 0.77599108, "num_input_tokens_seen": 117123100, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.53515625, "step": 5454, "time_per_iteration": 2.445586681365967 }, { "auxiliary_loss_clip": 0.01074595, "auxiliary_loss_mlp": 0.01030052, "balance_loss_clip": 1.01624203, "balance_loss_mlp": 1.02453721, "epoch": 0.32797234330377273, "flos": 16507144016640.0, "grad_norm": 1.7433348099328165, "language_loss": 0.76670611, "learning_rate": 3.029410366232259e-06, "loss": 0.78775251, "num_input_tokens_seen": 117140515, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.5, "step": 5455, "time_per_iteration": 2.3452672958374023 }, { "auxiliary_loss_clip": 0.01080187, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.01670265, "balance_loss_mlp": 1.02370977, "epoch": 0.3280324665564407, "flos": 26576723727360.0, "grad_norm": 1.6577279543472647, "language_loss": 0.73814428, "learning_rate": 3.0290864153816467e-06, "loss": 0.75928849, "num_input_tokens_seen": 117161485, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5625, "step": 5456, "time_per_iteration": 2.44221830368042 }, { "auxiliary_loss_clip": 0.01080212, "auxiliary_loss_mlp": 0.01034765, "balance_loss_clip": 1.01813531, "balance_loss_mlp": 1.02497685, "epoch": 0.32809258980910866, "flos": 22928858565120.0, "grad_norm": 1.3973542595297122, "language_loss": 0.78002566, "learning_rate": 3.028762427805588e-06, "loss": 0.80117542, "num_input_tokens_seen": 117181870, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.55078125, "step": 5457, "time_per_iteration": 2.3832404613494873 }, { "auxiliary_loss_clip": 0.01078947, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.01989484, "balance_loss_mlp": 1.0238179, "epoch": 0.3281527130617766, "flos": 22782747058560.0, "grad_norm": 2.2032463858877755, "language_loss": 0.78798318, "learning_rate": 3.028438403515645e-06, "loss": 0.80912393, "num_input_tokens_seen": 117201380, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.55078125, "step": 5458, "time_per_iteration": 2.4138641357421875 }, { "auxiliary_loss_clip": 0.01077177, "auxiliary_loss_mlp": 0.0102955, "balance_loss_clip": 1.01431465, "balance_loss_mlp": 1.02429223, "epoch": 0.3282128363144446, "flos": 21249641036160.0, "grad_norm": 1.8622662939696581, "language_loss": 0.7289722, "learning_rate": 3.0281143425233795e-06, "loss": 0.7500394, "num_input_tokens_seen": 117221040, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.52734375, "step": 5459, "time_per_iteration": 2.399421215057373 }, { "auxiliary_loss_clip": 0.01080956, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.01768255, "balance_loss_mlp": 1.02518368, "epoch": 0.32827295956711255, "flos": 30841853460480.0, "grad_norm": 1.7544502809181561, "language_loss": 0.84002161, "learning_rate": 3.02779024484036e-06, "loss": 0.86117017, "num_input_tokens_seen": 117241395, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55859375, "step": 5460, "time_per_iteration": 2.464229106903076 }, { "auxiliary_loss_clip": 0.01076967, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.01539469, "balance_loss_mlp": 1.02279675, "epoch": 0.3283330828197806, "flos": 25914002699520.0, "grad_norm": 1.7437057590676457, "language_loss": 0.76681101, "learning_rate": 3.0274661104781483e-06, "loss": 0.78788352, "num_input_tokens_seen": 117259340, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5390625, "step": 5461, "time_per_iteration": 2.402656078338623 }, { "auxiliary_loss_clip": 0.0107789, "auxiliary_loss_mlp": 0.01031778, "balance_loss_clip": 1.01458812, "balance_loss_mlp": 1.02455664, "epoch": 0.32839320607244854, "flos": 38580526604160.0, "grad_norm": 1.9193213046648052, "language_loss": 0.63180983, "learning_rate": 3.027141939448315e-06, "loss": 0.65290648, "num_input_tokens_seen": 117282375, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.53125, "step": 5462, "time_per_iteration": 2.5308499336242676 }, { "auxiliary_loss_clip": 0.0107776, "auxiliary_loss_mlp": 0.01027519, "balance_loss_clip": 1.0127486, "balance_loss_mlp": 1.02419543, "epoch": 0.3284533293251165, "flos": 26649692202240.0, "grad_norm": 1.6113829294151034, "language_loss": 0.77892303, "learning_rate": 3.0268177317624275e-06, "loss": 0.79997581, "num_input_tokens_seen": 117303830, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5390625, "step": 5463, "time_per_iteration": 2.426086664199829 }, { "auxiliary_loss_clip": 0.01077726, "auxiliary_loss_mlp": 0.0103632, "balance_loss_clip": 1.02021456, "balance_loss_mlp": 1.02386844, "epoch": 0.32851345257778447, "flos": 15303268915200.0, "grad_norm": 2.125496897238556, "language_loss": 0.69328785, "learning_rate": 3.0264934874320566e-06, "loss": 0.71442831, "num_input_tokens_seen": 117320665, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5390625, "step": 5464, "time_per_iteration": 2.3595173358917236 }, { "auxiliary_loss_clip": 0.01077268, "auxiliary_loss_mlp": 0.01035001, "balance_loss_clip": 1.01927733, "balance_loss_mlp": 1.02579987, "epoch": 0.32857357583045244, "flos": 23512606364160.0, "grad_norm": 2.222365487172947, "language_loss": 0.72405088, "learning_rate": 3.026169206468774e-06, "loss": 0.74517351, "num_input_tokens_seen": 117339795, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.515625, "step": 5465, "time_per_iteration": 2.3878488540649414 }, { "auxiliary_loss_clip": 0.0107929, "auxiliary_loss_mlp": 0.01027737, "balance_loss_clip": 1.01265681, "balance_loss_mlp": 1.02633595, "epoch": 0.3286336990831204, "flos": 20994181551360.0, "grad_norm": 1.3946260054309163, "language_loss": 0.82909477, "learning_rate": 3.025844888884152e-06, "loss": 0.85016495, "num_input_tokens_seen": 117359525, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53125, "step": 5466, "time_per_iteration": 2.384462594985962 }, { "auxiliary_loss_clip": 0.01077987, "auxiliary_loss_mlp": 0.01031917, "balance_loss_clip": 1.01669383, "balance_loss_mlp": 1.02449858, "epoch": 0.32869382233578837, "flos": 23657705441280.0, "grad_norm": 1.6523196533680227, "language_loss": 0.79685318, "learning_rate": 3.0255205346897646e-06, "loss": 0.81795228, "num_input_tokens_seen": 117380320, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53515625, "step": 5467, "time_per_iteration": 3.8258657455444336 }, { "auxiliary_loss_clip": 0.01077876, "auxiliary_loss_mlp": 0.0103499, "balance_loss_clip": 1.01895642, "balance_loss_mlp": 1.02396202, "epoch": 0.32875394558845633, "flos": 25335386870400.0, "grad_norm": 1.5816377923003648, "language_loss": 0.74493074, "learning_rate": 3.0251961438971866e-06, "loss": 0.7660594, "num_input_tokens_seen": 117400695, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5468, "time_per_iteration": 2.4155049324035645 }, { "auxiliary_loss_clip": 0.01084114, "auxiliary_loss_mlp": 0.01036531, "balance_loss_clip": 1.01854181, "balance_loss_mlp": 1.02674651, "epoch": 0.3288140688411243, "flos": 14902221594240.0, "grad_norm": 2.5271646654174105, "language_loss": 0.78453338, "learning_rate": 3.024871716517996e-06, "loss": 0.80573982, "num_input_tokens_seen": 117418800, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.57421875, "step": 5469, "time_per_iteration": 2.3450305461883545 }, { "auxiliary_loss_clip": 0.01077556, "auxiliary_loss_mlp": 0.01028308, "balance_loss_clip": 1.01290655, "balance_loss_mlp": 1.02343345, "epoch": 0.32887419209379226, "flos": 18550366047360.0, "grad_norm": 1.8224757690139617, "language_loss": 0.81447303, "learning_rate": 3.0245472525637706e-06, "loss": 0.83553171, "num_input_tokens_seen": 117438220, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5390625, "step": 5470, "time_per_iteration": 2.3762497901916504 }, { "auxiliary_loss_clip": 0.0107779, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.01378453, "balance_loss_mlp": 1.0233016, "epoch": 0.3289343153464602, "flos": 48103785360000.0, "grad_norm": 1.6722158362914659, "language_loss": 0.67574811, "learning_rate": 3.0242227520460885e-06, "loss": 0.69682676, "num_input_tokens_seen": 117462560, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.546875, "step": 5471, "time_per_iteration": 3.962620496749878 }, { "auxiliary_loss_clip": 0.01080355, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.01510489, "balance_loss_mlp": 1.02359247, "epoch": 0.3289944385991282, "flos": 27599050425600.0, "grad_norm": 7.410125431003147, "language_loss": 0.64579719, "learning_rate": 3.023898214976531e-06, "loss": 0.66693652, "num_input_tokens_seen": 117483665, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.56640625, "step": 5472, "time_per_iteration": 3.9157164096832275 }, { "auxiliary_loss_clip": 0.01077792, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.01887417, "balance_loss_mlp": 1.0238626, "epoch": 0.32905456185179616, "flos": 20119292991360.0, "grad_norm": 1.6672053581604682, "language_loss": 0.88222539, "learning_rate": 3.02357364136668e-06, "loss": 0.90336084, "num_input_tokens_seen": 117503565, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5390625, "step": 5473, "time_per_iteration": 2.389453172683716 }, { "auxiliary_loss_clip": 0.01080474, "auxiliary_loss_mlp": 0.01038804, "balance_loss_clip": 1.02100611, "balance_loss_mlp": 1.02503395, "epoch": 0.3291146851044642, "flos": 23179255902720.0, "grad_norm": 5.034410165739722, "language_loss": 0.78064179, "learning_rate": 3.023249031228119e-06, "loss": 0.80183458, "num_input_tokens_seen": 117521460, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.5546875, "step": 5474, "time_per_iteration": 2.3733670711517334 }, { "auxiliary_loss_clip": 0.0101337, "auxiliary_loss_mlp": 0.01002093, "balance_loss_clip": 1.0008707, "balance_loss_mlp": 1.00199521, "epoch": 0.32917480835713214, "flos": 67618626249600.0, "grad_norm": 0.8068229367215125, "language_loss": 0.60187316, "learning_rate": 3.0229243845724323e-06, "loss": 0.62202775, "num_input_tokens_seen": 117580550, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.11376953, "step": 5475, "time_per_iteration": 2.99570631980896 }, { "auxiliary_loss_clip": 0.01080725, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.01752687, "balance_loss_mlp": 1.02312112, "epoch": 0.3292349316098001, "flos": 27963299306880.0, "grad_norm": 3.683065370882492, "language_loss": 0.76878214, "learning_rate": 3.022599701411205e-06, "loss": 0.78994703, "num_input_tokens_seen": 117600645, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.578125, "step": 5476, "time_per_iteration": 3.75848388671875 }, { "auxiliary_loss_clip": 0.01080818, "auxiliary_loss_mlp": 0.01038302, "balance_loss_clip": 1.02189863, "balance_loss_mlp": 1.02523828, "epoch": 0.3292950548624681, "flos": 20262716323200.0, "grad_norm": 1.6678265199555182, "language_loss": 0.74488866, "learning_rate": 3.0222749817560252e-06, "loss": 0.76607984, "num_input_tokens_seen": 117618880, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.55859375, "step": 5477, "time_per_iteration": 2.3645784854888916 }, { "auxiliary_loss_clip": 0.01074658, "auxiliary_loss_mlp": 0.01030904, "balance_loss_clip": 1.01579463, "balance_loss_mlp": 1.02327657, "epoch": 0.32935517811513604, "flos": 20811969832320.0, "grad_norm": 2.019059742157605, "language_loss": 0.75214255, "learning_rate": 3.0219502256184804e-06, "loss": 0.77319813, "num_input_tokens_seen": 117636445, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.515625, "step": 5478, "time_per_iteration": 2.383361577987671 }, { "auxiliary_loss_clip": 0.01078613, "auxiliary_loss_mlp": 0.01034305, "balance_loss_clip": 1.01910627, "balance_loss_mlp": 1.025846, "epoch": 0.329415301367804, "flos": 18440878423680.0, "grad_norm": 2.0264913751417093, "language_loss": 0.8089844, "learning_rate": 3.0216254330101617e-06, "loss": 0.83011365, "num_input_tokens_seen": 117653105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53125, "step": 5479, "time_per_iteration": 2.34741473197937 }, { "auxiliary_loss_clip": 0.01012531, "auxiliary_loss_mlp": 0.01002723, "balance_loss_clip": 1.00146508, "balance_loss_mlp": 1.00145316, "epoch": 0.32947542462047197, "flos": 66319367713920.0, "grad_norm": 0.7611947774682117, "language_loss": 0.56492686, "learning_rate": 3.0213006039426587e-06, "loss": 0.58507937, "num_input_tokens_seen": 117719225, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.11083984, "step": 5480, "time_per_iteration": 3.1084365844726562 }, { "auxiliary_loss_clip": 0.0107615, "auxiliary_loss_mlp": 0.01033383, "balance_loss_clip": 1.01784968, "balance_loss_mlp": 1.02299666, "epoch": 0.32953554787313993, "flos": 23220488085120.0, "grad_norm": 2.002717647561456, "language_loss": 0.7727921, "learning_rate": 3.0209757384275643e-06, "loss": 0.79388744, "num_input_tokens_seen": 117738725, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.53125, "step": 5481, "time_per_iteration": 2.3818211555480957 }, { "auxiliary_loss_clip": 0.01081065, "auxiliary_loss_mlp": 0.01030712, "balance_loss_clip": 1.01504767, "balance_loss_mlp": 1.02616775, "epoch": 0.3295956711258079, "flos": 27008460000000.0, "grad_norm": 1.5277859408892422, "language_loss": 0.78278458, "learning_rate": 3.020650836476472e-06, "loss": 0.80390239, "num_input_tokens_seen": 117757765, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.546875, "step": 5482, "time_per_iteration": 2.418118476867676 }, { "auxiliary_loss_clip": 0.01079361, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.01869941, "balance_loss_mlp": 1.0245055, "epoch": 0.32965579437847586, "flos": 19170702817920.0, "grad_norm": 2.179453532430209, "language_loss": 0.73739564, "learning_rate": 3.0203258981009767e-06, "loss": 0.75853866, "num_input_tokens_seen": 117776810, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.546875, "step": 5483, "time_per_iteration": 2.362783670425415 }, { "auxiliary_loss_clip": 0.01079387, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.01248038, "balance_loss_mlp": 1.02548862, "epoch": 0.32971591763114383, "flos": 30481200449280.0, "grad_norm": 2.2287266426919, "language_loss": 0.7526831, "learning_rate": 3.0200009233126745e-06, "loss": 0.77375823, "num_input_tokens_seen": 117797730, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5390625, "step": 5484, "time_per_iteration": 2.4598166942596436 }, { "auxiliary_loss_clip": 0.0107882, "auxiliary_loss_mlp": 0.01035946, "balance_loss_clip": 1.02034175, "balance_loss_mlp": 1.02521205, "epoch": 0.3297760408838118, "flos": 16288657528320.0, "grad_norm": 1.8076020325860054, "language_loss": 0.71962059, "learning_rate": 3.0196759121231636e-06, "loss": 0.74076831, "num_input_tokens_seen": 117815365, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 5485, "time_per_iteration": 2.351759672164917 }, { "auxiliary_loss_clip": 0.01076681, "auxiliary_loss_mlp": 0.01040589, "balance_loss_clip": 1.02534819, "balance_loss_mlp": 1.02417111, "epoch": 0.32983616413647976, "flos": 29529712632960.0, "grad_norm": 1.6391832855517314, "language_loss": 0.80184996, "learning_rate": 3.0193508645440424e-06, "loss": 0.82302266, "num_input_tokens_seen": 117836095, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5234375, "step": 5486, "time_per_iteration": 2.450106143951416 }, { "auxiliary_loss_clip": 0.01075837, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.01403403, "balance_loss_mlp": 1.02320254, "epoch": 0.3298962873891478, "flos": 20630351606400.0, "grad_norm": 2.028862755313964, "language_loss": 0.83958948, "learning_rate": 3.0190257805869106e-06, "loss": 0.86064625, "num_input_tokens_seen": 117854655, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.52734375, "step": 5487, "time_per_iteration": 2.365351676940918 }, { "auxiliary_loss_clip": 0.01082498, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.01861644, "balance_loss_mlp": 1.02514911, "epoch": 0.32995641064181574, "flos": 14975120246400.0, "grad_norm": 2.017911445322458, "language_loss": 0.73766994, "learning_rate": 3.01870066026337e-06, "loss": 0.75885332, "num_input_tokens_seen": 117873300, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.57421875, "step": 5488, "time_per_iteration": 2.3447751998901367 }, { "auxiliary_loss_clip": 0.01077945, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.01941299, "balance_loss_mlp": 1.02367687, "epoch": 0.3300165338944837, "flos": 18660447164160.0, "grad_norm": 2.191391902059211, "language_loss": 0.72819901, "learning_rate": 3.018375503585023e-06, "loss": 0.74934149, "num_input_tokens_seen": 117891540, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5390625, "step": 5489, "time_per_iteration": 2.352830410003662 }, { "auxiliary_loss_clip": 0.01075289, "auxiliary_loss_mlp": 0.01033082, "balance_loss_clip": 1.01680982, "balance_loss_mlp": 1.02175522, "epoch": 0.3300766571471517, "flos": 25582816742400.0, "grad_norm": 2.533477887498127, "language_loss": 0.88714314, "learning_rate": 3.018050310563474e-06, "loss": 0.90822685, "num_input_tokens_seen": 117907690, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.53515625, "step": 5490, "time_per_iteration": 2.3791744709014893 }, { "auxiliary_loss_clip": 0.01078381, "auxiliary_loss_mlp": 0.01033527, "balance_loss_clip": 1.01782727, "balance_loss_mlp": 1.02359843, "epoch": 0.33013678039981964, "flos": 11362726892160.0, "grad_norm": 1.9242370586077664, "language_loss": 0.83497536, "learning_rate": 3.0177250812103286e-06, "loss": 0.85609448, "num_input_tokens_seen": 117925640, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.546875, "step": 5491, "time_per_iteration": 2.3475584983825684 }, { "auxiliary_loss_clip": 0.01076319, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.01445293, "balance_loss_mlp": 1.02417612, "epoch": 0.3301969036524876, "flos": 24820208714880.0, "grad_norm": 1.9356188378003132, "language_loss": 0.77759326, "learning_rate": 3.017399815537193e-06, "loss": 0.79866719, "num_input_tokens_seen": 117944525, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5234375, "step": 5492, "time_per_iteration": 2.4017999172210693 }, { "auxiliary_loss_clip": 0.01082598, "auxiliary_loss_mlp": 0.01036579, "balance_loss_clip": 1.01918674, "balance_loss_mlp": 1.02648592, "epoch": 0.33025702690515557, "flos": 15960229568640.0, "grad_norm": 3.011100252222635, "language_loss": 0.74424094, "learning_rate": 3.0170745135556744e-06, "loss": 0.76543272, "num_input_tokens_seen": 117962515, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 5493, "time_per_iteration": 2.3523318767547607 }, { "auxiliary_loss_clip": 0.01012345, "auxiliary_loss_mlp": 0.01000917, "balance_loss_clip": 0.99961793, "balance_loss_mlp": 1.001477, "epoch": 0.33031715015782354, "flos": 59413582897920.0, "grad_norm": 0.7828648661663578, "language_loss": 0.53932202, "learning_rate": 3.0167491752773826e-06, "loss": 0.55945462, "num_input_tokens_seen": 118018780, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.10839844, "step": 5494, "time_per_iteration": 2.961301326751709 }, { "auxiliary_loss_clip": 0.01077667, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.01802456, "balance_loss_mlp": 1.02586997, "epoch": 0.3303772734104915, "flos": 23183270709120.0, "grad_norm": 5.756944714097515, "language_loss": 0.8663829, "learning_rate": 3.0164238007139285e-06, "loss": 0.88750041, "num_input_tokens_seen": 118038610, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.515625, "step": 5495, "time_per_iteration": 2.3944449424743652 }, { "auxiliary_loss_clip": 0.01082321, "auxiliary_loss_mlp": 0.01044525, "balance_loss_clip": 1.02660799, "balance_loss_mlp": 1.02547014, "epoch": 0.33043739666315947, "flos": 33070533966720.0, "grad_norm": 3.088849380326435, "language_loss": 0.73474109, "learning_rate": 3.0160983898769233e-06, "loss": 0.75600958, "num_input_tokens_seen": 118055905, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.5703125, "step": 5496, "time_per_iteration": 2.476020097732544 }, { "auxiliary_loss_clip": 0.01077013, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.01724291, "balance_loss_mlp": 1.02416658, "epoch": 0.33049751991582743, "flos": 24894399087360.0, "grad_norm": 2.065212289781025, "language_loss": 0.72320926, "learning_rate": 3.015772942777981e-06, "loss": 0.74429905, "num_input_tokens_seen": 118073695, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.52734375, "step": 5497, "time_per_iteration": 2.3937156200408936 }, { "auxiliary_loss_clip": 0.01080199, "auxiliary_loss_mlp": 0.01031382, "balance_loss_clip": 1.01635528, "balance_loss_mlp": 1.02741897, "epoch": 0.3305576431684954, "flos": 29459292687360.0, "grad_norm": 1.8311271164735543, "language_loss": 0.7991792, "learning_rate": 3.015447459428714e-06, "loss": 0.82029504, "num_input_tokens_seen": 118094030, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.52734375, "step": 5498, "time_per_iteration": 2.4372262954711914 }, { "auxiliary_loss_clip": 0.01077133, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.01810145, "balance_loss_mlp": 1.02355289, "epoch": 0.33061776642116336, "flos": 22631363936640.0, "grad_norm": 2.858740061844805, "language_loss": 0.76234901, "learning_rate": 3.01512193984074e-06, "loss": 0.78346241, "num_input_tokens_seen": 118111665, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.53515625, "step": 5499, "time_per_iteration": 2.391963005065918 }, { "auxiliary_loss_clip": 0.01076669, "auxiliary_loss_mlp": 0.01031135, "balance_loss_clip": 1.01641226, "balance_loss_mlp": 1.02401519, "epoch": 0.3306778896738313, "flos": 25775117844480.0, "grad_norm": 1.8301505462738015, "language_loss": 0.79005277, "learning_rate": 3.0147963840256748e-06, "loss": 0.81113082, "num_input_tokens_seen": 118132435, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.52734375, "step": 5500, "time_per_iteration": 2.4190316200256348 }, { "auxiliary_loss_clip": 0.01081682, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.01567161, "balance_loss_mlp": 1.02551925, "epoch": 0.33073801292649935, "flos": 36939050121600.0, "grad_norm": 1.7803486323603586, "language_loss": 0.6642729, "learning_rate": 3.0144707919951376e-06, "loss": 0.68542069, "num_input_tokens_seen": 118155255, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5625, "step": 5501, "time_per_iteration": 2.5176336765289307 }, { "auxiliary_loss_clip": 0.01079887, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.01729608, "balance_loss_mlp": 1.02398086, "epoch": 0.3307981361791673, "flos": 12966951087360.0, "grad_norm": 2.194220947815171, "language_loss": 0.7766012, "learning_rate": 3.014145163760747e-06, "loss": 0.79775625, "num_input_tokens_seen": 118169865, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.55859375, "step": 5502, "time_per_iteration": 2.33063006401062 }, { "auxiliary_loss_clip": 0.01082904, "auxiliary_loss_mlp": 0.0103426, "balance_loss_clip": 1.01813066, "balance_loss_mlp": 1.02699018, "epoch": 0.3308582594318353, "flos": 25373197739520.0, "grad_norm": 1.8803017426210131, "language_loss": 0.72392511, "learning_rate": 3.013819499334124e-06, "loss": 0.74509674, "num_input_tokens_seen": 118190760, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.55859375, "step": 5503, "time_per_iteration": 2.4150242805480957 }, { "auxiliary_loss_clip": 0.01079542, "auxiliary_loss_mlp": 0.01031826, "balance_loss_clip": 1.01608992, "balance_loss_mlp": 1.02399158, "epoch": 0.33091838268450324, "flos": 26467375749120.0, "grad_norm": 1.6156589334629954, "language_loss": 0.75059319, "learning_rate": 3.0134937987268913e-06, "loss": 0.77170682, "num_input_tokens_seen": 118213620, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5546875, "step": 5504, "time_per_iteration": 2.43715238571167 }, { "auxiliary_loss_clip": 0.01079161, "auxiliary_loss_mlp": 0.01037461, "balance_loss_clip": 1.02143955, "balance_loss_mlp": 1.02522111, "epoch": 0.3309785059371712, "flos": 24970055736960.0, "grad_norm": 7.375256822080118, "language_loss": 0.69805634, "learning_rate": 3.013168061950672e-06, "loss": 0.71922266, "num_input_tokens_seen": 118235010, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5505, "time_per_iteration": 2.5208687782287598 }, { "auxiliary_loss_clip": 0.01080602, "auxiliary_loss_mlp": 0.01040268, "balance_loss_clip": 1.02393603, "balance_loss_mlp": 1.026914, "epoch": 0.3310386291898392, "flos": 20445731003520.0, "grad_norm": 1.613642332257, "language_loss": 0.82061791, "learning_rate": 3.0128422890170908e-06, "loss": 0.84182662, "num_input_tokens_seen": 118255820, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5390625, "step": 5506, "time_per_iteration": 3.81140398979187 }, { "auxiliary_loss_clip": 0.01080435, "auxiliary_loss_mlp": 0.0103575, "balance_loss_clip": 1.01950145, "balance_loss_mlp": 1.02583778, "epoch": 0.33109875244250714, "flos": 23181629875200.0, "grad_norm": 1.7497305121320694, "language_loss": 0.79462695, "learning_rate": 3.0125164799377727e-06, "loss": 0.81578875, "num_input_tokens_seen": 118274160, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.546875, "step": 5507, "time_per_iteration": 2.378666400909424 }, { "auxiliary_loss_clip": 0.01077581, "auxiliary_loss_mlp": 0.01039683, "balance_loss_clip": 1.02302921, "balance_loss_mlp": 1.02317238, "epoch": 0.3311588756951751, "flos": 24167297779200.0, "grad_norm": 1.590986125933765, "language_loss": 0.71247351, "learning_rate": 3.0121906347243473e-06, "loss": 0.73364615, "num_input_tokens_seen": 118294385, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.54296875, "step": 5508, "time_per_iteration": 2.388368844985962 }, { "auxiliary_loss_clip": 0.01079258, "auxiliary_loss_mlp": 0.01029568, "balance_loss_clip": 1.0145123, "balance_loss_mlp": 1.02706981, "epoch": 0.33121899894784307, "flos": 28144533507840.0, "grad_norm": 1.7840217270823422, "language_loss": 0.71811736, "learning_rate": 3.011864753388441e-06, "loss": 0.7392056, "num_input_tokens_seen": 118313105, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.51953125, "step": 5509, "time_per_iteration": 2.4253482818603516 }, { "auxiliary_loss_clip": 0.01079044, "auxiliary_loss_mlp": 0.01034368, "balance_loss_clip": 1.01773834, "balance_loss_mlp": 1.02436996, "epoch": 0.33127912220051103, "flos": 29566441249920.0, "grad_norm": 1.549841388085175, "language_loss": 0.73306143, "learning_rate": 3.0115388359416845e-06, "loss": 0.75419545, "num_input_tokens_seen": 118335250, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.546875, "step": 5510, "time_per_iteration": 2.436173677444458 }, { "auxiliary_loss_clip": 0.01076664, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.01986647, "balance_loss_mlp": 1.02390003, "epoch": 0.331339245453179, "flos": 14427961418880.0, "grad_norm": 2.2954665977566537, "language_loss": 0.87709373, "learning_rate": 3.011212882395709e-06, "loss": 0.89822054, "num_input_tokens_seen": 118351470, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.52734375, "step": 5511, "time_per_iteration": 5.153402805328369 }, { "auxiliary_loss_clip": 0.0107445, "auxiliary_loss_mlp": 0.01031561, "balance_loss_clip": 1.01763129, "balance_loss_mlp": 1.02489877, "epoch": 0.33139936870584696, "flos": 20886055470720.0, "grad_norm": 1.7593741678117978, "language_loss": 0.73084962, "learning_rate": 3.010886892762147e-06, "loss": 0.75190973, "num_input_tokens_seen": 118370970, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.49609375, "step": 5512, "time_per_iteration": 2.3822708129882812 }, { "auxiliary_loss_clip": 0.01078026, "auxiliary_loss_mlp": 0.01035621, "balance_loss_clip": 1.02036226, "balance_loss_mlp": 1.02561522, "epoch": 0.33145949195851493, "flos": 36282857518080.0, "grad_norm": 1.7300419795133566, "language_loss": 0.72435218, "learning_rate": 3.0105608670526317e-06, "loss": 0.74548864, "num_input_tokens_seen": 118393125, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5234375, "step": 5513, "time_per_iteration": 2.4983346462249756 }, { "auxiliary_loss_clip": 0.01081727, "auxiliary_loss_mlp": 0.01037217, "balance_loss_clip": 1.01943088, "balance_loss_mlp": 1.0249964, "epoch": 0.33151961521118295, "flos": 14278952269440.0, "grad_norm": 1.9408920491872914, "language_loss": 0.68344891, "learning_rate": 3.010234805278799e-06, "loss": 0.70463836, "num_input_tokens_seen": 118410860, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.56640625, "step": 5514, "time_per_iteration": 2.3421661853790283 }, { "auxiliary_loss_clip": 0.01079897, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.02193093, "balance_loss_mlp": 1.02536178, "epoch": 0.3315797384638509, "flos": 20773356001920.0, "grad_norm": 2.689355754922417, "language_loss": 0.66556174, "learning_rate": 3.0099087074522844e-06, "loss": 0.68675697, "num_input_tokens_seen": 118429570, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.546875, "step": 5515, "time_per_iteration": 2.378521203994751 }, { "auxiliary_loss_clip": 0.01078409, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.01386523, "balance_loss_mlp": 1.02416182, "epoch": 0.3316398617165189, "flos": 24678356394240.0, "grad_norm": 1.5437653433032403, "language_loss": 0.69410121, "learning_rate": 3.009582573584726e-06, "loss": 0.71519154, "num_input_tokens_seen": 118450285, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5390625, "step": 5516, "time_per_iteration": 3.8009870052337646 }, { "auxiliary_loss_clip": 0.01078705, "auxiliary_loss_mlp": 0.01031319, "balance_loss_clip": 1.01446319, "balance_loss_mlp": 1.02335072, "epoch": 0.33169998496918685, "flos": 18586989930240.0, "grad_norm": 2.5286791546493124, "language_loss": 0.80638069, "learning_rate": 3.0092564036877624e-06, "loss": 0.82748091, "num_input_tokens_seen": 118468270, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5546875, "step": 5517, "time_per_iteration": 2.361835241317749 }, { "auxiliary_loss_clip": 0.0107473, "auxiliary_loss_mlp": 0.01032545, "balance_loss_clip": 1.01736951, "balance_loss_mlp": 1.02369392, "epoch": 0.3317601082218548, "flos": 20192610579840.0, "grad_norm": 1.8632426934898658, "language_loss": 0.74322176, "learning_rate": 3.0089301977730343e-06, "loss": 0.76429451, "num_input_tokens_seen": 118486615, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.51171875, "step": 5518, "time_per_iteration": 2.362668514251709 }, { "auxiliary_loss_clip": 0.01013505, "auxiliary_loss_mlp": 0.01001692, "balance_loss_clip": 1.00026703, "balance_loss_mlp": 1.00193334, "epoch": 0.3318202314745228, "flos": 68971301032320.0, "grad_norm": 0.6087780292243, "language_loss": 0.54339445, "learning_rate": 3.008603955852182e-06, "loss": 0.56354642, "num_input_tokens_seen": 118553580, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.11572266, "step": 5519, "time_per_iteration": 3.1202445030212402 }, { "auxiliary_loss_clip": 0.01077966, "auxiliary_loss_mlp": 0.01031499, "balance_loss_clip": 1.01454747, "balance_loss_mlp": 1.02486324, "epoch": 0.33188035472719074, "flos": 21499235412480.0, "grad_norm": 2.2352145529889387, "language_loss": 0.78812635, "learning_rate": 3.00827767793685e-06, "loss": 0.80922097, "num_input_tokens_seen": 118570280, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.53125, "step": 5520, "time_per_iteration": 2.3668649196624756 }, { "auxiliary_loss_clip": 0.01076427, "auxiliary_loss_mlp": 0.01029362, "balance_loss_clip": 1.01472282, "balance_loss_mlp": 1.02562094, "epoch": 0.3319404779798587, "flos": 28869400488960.0, "grad_norm": 1.677298408178834, "language_loss": 0.76335812, "learning_rate": 3.0079513640386806e-06, "loss": 0.78441596, "num_input_tokens_seen": 118590455, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5078125, "step": 5521, "time_per_iteration": 2.426194667816162 }, { "auxiliary_loss_clip": 0.01078363, "auxiliary_loss_mlp": 0.01032232, "balance_loss_clip": 1.01630592, "balance_loss_mlp": 1.02395725, "epoch": 0.33200060123252667, "flos": 23075773032960.0, "grad_norm": 2.390168532822534, "language_loss": 0.70330912, "learning_rate": 3.00762501416932e-06, "loss": 0.72441512, "num_input_tokens_seen": 118609495, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.54296875, "step": 5522, "time_per_iteration": 2.376386880874634 }, { "auxiliary_loss_clip": 0.01076956, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.01673639, "balance_loss_mlp": 1.02484918, "epoch": 0.33206072448519464, "flos": 21141410221440.0, "grad_norm": 1.7767204860105101, "language_loss": 0.73826516, "learning_rate": 3.007298628340414e-06, "loss": 0.75935268, "num_input_tokens_seen": 118628720, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.51953125, "step": 5523, "time_per_iteration": 2.4023706912994385 }, { "auxiliary_loss_clip": 0.01073963, "auxiliary_loss_mlp": 0.01035024, "balance_loss_clip": 1.01871657, "balance_loss_mlp": 1.02310383, "epoch": 0.3321208477378626, "flos": 13078254101760.0, "grad_norm": 1.6769998269019346, "language_loss": 0.81628752, "learning_rate": 3.0069722065636114e-06, "loss": 0.83737737, "num_input_tokens_seen": 118645955, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5078125, "step": 5524, "time_per_iteration": 2.3407528400421143 }, { "auxiliary_loss_clip": 0.0107441, "auxiliary_loss_mlp": 0.01029288, "balance_loss_clip": 1.01440465, "balance_loss_mlp": 1.02322602, "epoch": 0.33218097099053057, "flos": 21214343784960.0, "grad_norm": 1.9232459480305069, "language_loss": 0.82582688, "learning_rate": 3.006645748850561e-06, "loss": 0.84686387, "num_input_tokens_seen": 118665605, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.51171875, "step": 5525, "time_per_iteration": 2.3984243869781494 }, { "auxiliary_loss_clip": 0.01012263, "auxiliary_loss_mlp": 0.01001452, "balance_loss_clip": 1.00016451, "balance_loss_mlp": 1.00131154, "epoch": 0.33224109424319853, "flos": 64345483376640.0, "grad_norm": 0.765381682479276, "language_loss": 0.52468234, "learning_rate": 3.006319255212913e-06, "loss": 0.54481953, "num_input_tokens_seen": 118728155, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.109375, "step": 5526, "time_per_iteration": 2.991244316101074 }, { "auxiliary_loss_clip": 0.01078519, "auxiliary_loss_mlp": 0.01038393, "balance_loss_clip": 1.02202523, "balance_loss_mlp": 1.02494764, "epoch": 0.33230121749586655, "flos": 17345094491520.0, "grad_norm": 1.992856585999236, "language_loss": 0.77381909, "learning_rate": 3.0059927256623195e-06, "loss": 0.79498827, "num_input_tokens_seen": 118743955, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53515625, "step": 5527, "time_per_iteration": 2.3451356887817383 }, { "auxiliary_loss_clip": 0.01079503, "auxiliary_loss_mlp": 0.0103317, "balance_loss_clip": 1.01830423, "balance_loss_mlp": 1.02678227, "epoch": 0.3323613407485345, "flos": 20995962030720.0, "grad_norm": 2.2897982068716547, "language_loss": 0.71828073, "learning_rate": 3.005666160210434e-06, "loss": 0.73940748, "num_input_tokens_seen": 118763275, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.52734375, "step": 5528, "time_per_iteration": 2.368333101272583 }, { "auxiliary_loss_clip": 0.01076571, "auxiliary_loss_mlp": 0.01025358, "balance_loss_clip": 1.0109458, "balance_loss_mlp": 1.02340984, "epoch": 0.3324214640012025, "flos": 13151676424320.0, "grad_norm": 1.6105959128340976, "language_loss": 0.82893622, "learning_rate": 3.005339558868909e-06, "loss": 0.8499555, "num_input_tokens_seen": 118781110, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.53125, "step": 5529, "time_per_iteration": 2.357602834701538 }, { "auxiliary_loss_clip": 0.01079475, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.01735663, "balance_loss_mlp": 1.02448368, "epoch": 0.33248158725387045, "flos": 22272421582080.0, "grad_norm": 2.561172822028531, "language_loss": 0.69755512, "learning_rate": 3.0050129216494017e-06, "loss": 0.71868747, "num_input_tokens_seen": 118800620, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 5530, "time_per_iteration": 2.370915412902832 }, { "auxiliary_loss_clip": 0.0107758, "auxiliary_loss_mlp": 0.01037905, "balance_loss_clip": 1.02079821, "balance_loss_mlp": 1.02417314, "epoch": 0.3325417105065384, "flos": 20739943964160.0, "grad_norm": 2.457266075947352, "language_loss": 0.76323688, "learning_rate": 3.004686248563569e-06, "loss": 0.78439176, "num_input_tokens_seen": 118818725, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.53515625, "step": 5531, "time_per_iteration": 2.375516414642334 }, { "auxiliary_loss_clip": 0.0107839, "auxiliary_loss_mlp": 0.01031938, "balance_loss_clip": 1.01682234, "balance_loss_mlp": 1.02428508, "epoch": 0.3326018337592064, "flos": 24789380117760.0, "grad_norm": 1.9400348786361958, "language_loss": 0.7339288, "learning_rate": 3.0043595396230675e-06, "loss": 0.75503206, "num_input_tokens_seen": 118839390, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5390625, "step": 5532, "time_per_iteration": 2.408520460128784 }, { "auxiliary_loss_clip": 0.01076965, "auxiliary_loss_mlp": 0.01028579, "balance_loss_clip": 1.01318359, "balance_loss_mlp": 1.02466965, "epoch": 0.33266195701187434, "flos": 14500825159680.0, "grad_norm": 2.0078269637567883, "language_loss": 0.65814972, "learning_rate": 3.004032794839558e-06, "loss": 0.67920512, "num_input_tokens_seen": 118856275, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5234375, "step": 5533, "time_per_iteration": 2.3536040782928467 }, { "auxiliary_loss_clip": 0.01078917, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.01380777, "balance_loss_mlp": 1.02548361, "epoch": 0.3327220802645423, "flos": 15303513294720.0, "grad_norm": 2.0519894762515407, "language_loss": 0.71043754, "learning_rate": 3.0037060142247006e-06, "loss": 0.73151791, "num_input_tokens_seen": 118873830, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.53515625, "step": 5534, "time_per_iteration": 2.3512964248657227 }, { "auxiliary_loss_clip": 0.01075729, "auxiliary_loss_mlp": 0.01026959, "balance_loss_clip": 1.0120821, "balance_loss_mlp": 1.02500868, "epoch": 0.3327822035172103, "flos": 23476401417600.0, "grad_norm": 2.238261373473902, "language_loss": 0.66874146, "learning_rate": 3.0033791977901582e-06, "loss": 0.68976831, "num_input_tokens_seen": 118891560, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 5535, "time_per_iteration": 2.3810787200927734 }, { "auxiliary_loss_clip": 0.01076021, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.01820302, "balance_loss_mlp": 1.02380037, "epoch": 0.33284232676987824, "flos": 25373337384960.0, "grad_norm": 3.5856777647169307, "language_loss": 0.72643864, "learning_rate": 3.0030523455475923e-06, "loss": 0.74753129, "num_input_tokens_seen": 118910260, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5234375, "step": 5536, "time_per_iteration": 2.4156932830810547 }, { "auxiliary_loss_clip": 0.01075153, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.01839089, "balance_loss_mlp": 1.02250683, "epoch": 0.3329024500225462, "flos": 23693281983360.0, "grad_norm": 1.7974857731830458, "language_loss": 0.81679094, "learning_rate": 3.0027254575086683e-06, "loss": 0.83787668, "num_input_tokens_seen": 118929985, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.52734375, "step": 5537, "time_per_iteration": 2.4111626148223877 }, { "auxiliary_loss_clip": 0.01081624, "auxiliary_loss_mlp": 0.01034877, "balance_loss_clip": 1.01918864, "balance_loss_mlp": 1.02702272, "epoch": 0.33296257327521417, "flos": 31721804167680.0, "grad_norm": 3.531693637137087, "language_loss": 0.713952, "learning_rate": 3.0023985336850526e-06, "loss": 0.73511702, "num_input_tokens_seen": 118951355, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.546875, "step": 5538, "time_per_iteration": 2.465549945831299 }, { "auxiliary_loss_clip": 0.01074623, "auxiliary_loss_mlp": 0.01028647, "balance_loss_clip": 1.01397192, "balance_loss_mlp": 1.02383459, "epoch": 0.33302269652788213, "flos": 22743679380480.0, "grad_norm": 1.7361844059001326, "language_loss": 0.74090689, "learning_rate": 3.0020715740884112e-06, "loss": 0.76193959, "num_input_tokens_seen": 118970910, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5078125, "step": 5539, "time_per_iteration": 2.3753397464752197 }, { "auxiliary_loss_clip": 0.01079784, "auxiliary_loss_mlp": 0.01035945, "balance_loss_clip": 1.01948261, "balance_loss_mlp": 1.02391446, "epoch": 0.33308281978055015, "flos": 11472947654400.0, "grad_norm": 2.257751568342649, "language_loss": 0.71164829, "learning_rate": 3.001744578730413e-06, "loss": 0.73280561, "num_input_tokens_seen": 118989200, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.55859375, "step": 5540, "time_per_iteration": 2.357773542404175 }, { "auxiliary_loss_clip": 0.01073691, "auxiliary_loss_mlp": 0.01027466, "balance_loss_clip": 1.01273119, "balance_loss_mlp": 1.02216721, "epoch": 0.3331429430332181, "flos": 38212262916480.0, "grad_norm": 1.6121686349596964, "language_loss": 0.60606539, "learning_rate": 3.0014175476227284e-06, "loss": 0.62707698, "num_input_tokens_seen": 119011030, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.515625, "step": 5541, "time_per_iteration": 2.5146448612213135 }, { "auxiliary_loss_clip": 0.01076282, "auxiliary_loss_mlp": 0.01033443, "balance_loss_clip": 1.01733828, "balance_loss_mlp": 1.02213919, "epoch": 0.3332030662858861, "flos": 22527566864640.0, "grad_norm": 2.82336732415007, "language_loss": 0.68974257, "learning_rate": 3.0010904807770267e-06, "loss": 0.71083981, "num_input_tokens_seen": 119030620, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5390625, "step": 5542, "time_per_iteration": 2.381967544555664 }, { "auxiliary_loss_clip": 0.0107615, "auxiliary_loss_mlp": 0.01030972, "balance_loss_clip": 1.01657724, "balance_loss_mlp": 1.02423632, "epoch": 0.33326318953855405, "flos": 15996853451520.0, "grad_norm": 1.5508973421615084, "language_loss": 0.75223792, "learning_rate": 3.0007633782049808e-06, "loss": 0.77330911, "num_input_tokens_seen": 119048015, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.51953125, "step": 5543, "time_per_iteration": 2.347586154937744 }, { "auxiliary_loss_clip": 0.01079862, "auxiliary_loss_mlp": 0.01028852, "balance_loss_clip": 1.01336658, "balance_loss_mlp": 1.02651429, "epoch": 0.333323312791222, "flos": 25592347543680.0, "grad_norm": 37.04893001160876, "language_loss": 0.74965572, "learning_rate": 3.000436239918264e-06, "loss": 0.77074289, "num_input_tokens_seen": 119066280, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.53125, "step": 5544, "time_per_iteration": 2.416640281677246 }, { "auxiliary_loss_clip": 0.01073255, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.01365125, "balance_loss_mlp": 1.02256787, "epoch": 0.33338343604389, "flos": 25118366659200.0, "grad_norm": 2.065096202813144, "language_loss": 0.70477557, "learning_rate": 3.0001090659285514e-06, "loss": 0.72579312, "num_input_tokens_seen": 119087680, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 5545, "time_per_iteration": 2.3997411727905273 }, { "auxiliary_loss_clip": 0.01073648, "auxiliary_loss_mlp": 0.01029524, "balance_loss_clip": 1.01487279, "balance_loss_mlp": 1.02298927, "epoch": 0.33344355929655795, "flos": 16946316408960.0, "grad_norm": 1.9542117608094833, "language_loss": 0.69310373, "learning_rate": 2.9997818562475194e-06, "loss": 0.71413547, "num_input_tokens_seen": 119105820, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5078125, "step": 5546, "time_per_iteration": 3.830307722091675 }, { "auxiliary_loss_clip": 0.0107685, "auxiliary_loss_mlp": 0.0103088, "balance_loss_clip": 1.01484632, "balance_loss_mlp": 1.02271593, "epoch": 0.3335036825492259, "flos": 27888410707200.0, "grad_norm": 1.530255302838695, "language_loss": 0.64674079, "learning_rate": 2.999454610886844e-06, "loss": 0.66781807, "num_input_tokens_seen": 119126630, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5547, "time_per_iteration": 2.445080280303955 }, { "auxiliary_loss_clip": 0.01073803, "auxiliary_loss_mlp": 0.0102841, "balance_loss_clip": 1.01397347, "balance_loss_mlp": 1.02250731, "epoch": 0.3335638058018939, "flos": 16178646234240.0, "grad_norm": 2.3783957839733847, "language_loss": 0.84989339, "learning_rate": 2.999127329858205e-06, "loss": 0.87091547, "num_input_tokens_seen": 119143375, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.51171875, "step": 5548, "time_per_iteration": 2.3369579315185547 }, { "auxiliary_loss_clip": 0.01075009, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.01317048, "balance_loss_mlp": 1.02133405, "epoch": 0.33362392905456184, "flos": 39894517733760.0, "grad_norm": 2.024635341949424, "language_loss": 0.74164677, "learning_rate": 2.9988000131732813e-06, "loss": 0.76268518, "num_input_tokens_seen": 119166450, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 5549, "time_per_iteration": 2.5600130558013916 }, { "auxiliary_loss_clip": 0.01077087, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 1.01602232, "balance_loss_mlp": 1.02468634, "epoch": 0.3336840523072298, "flos": 44269588938240.0, "grad_norm": 2.005761780232257, "language_loss": 0.68795037, "learning_rate": 2.998472660843755e-06, "loss": 0.70903265, "num_input_tokens_seen": 119189645, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5234375, "step": 5550, "time_per_iteration": 4.060149669647217 }, { "auxiliary_loss_clip": 0.01074862, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.01628661, "balance_loss_mlp": 1.02353942, "epoch": 0.33374417555989777, "flos": 15084782426880.0, "grad_norm": 1.7187397239201687, "language_loss": 0.60497022, "learning_rate": 2.998145272881307e-06, "loss": 0.62602764, "num_input_tokens_seen": 119208045, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.51171875, "step": 5551, "time_per_iteration": 3.7055156230926514 }, { "auxiliary_loss_clip": 0.01074716, "auxiliary_loss_mlp": 0.01028059, "balance_loss_clip": 1.01322913, "balance_loss_mlp": 1.02368164, "epoch": 0.33380429881256574, "flos": 15848333061120.0, "grad_norm": 1.6057997482365778, "language_loss": 0.70380235, "learning_rate": 2.997817849297622e-06, "loss": 0.72483003, "num_input_tokens_seen": 119224910, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.51171875, "step": 5552, "time_per_iteration": 2.349670886993408 }, { "auxiliary_loss_clip": 0.01075229, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 1.01615965, "balance_loss_mlp": 1.02256203, "epoch": 0.33386442206523376, "flos": 13479475979520.0, "grad_norm": 2.0648316931796185, "language_loss": 0.83294916, "learning_rate": 2.997490390104385e-06, "loss": 0.85401082, "num_input_tokens_seen": 119243290, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.52734375, "step": 5553, "time_per_iteration": 2.3437111377716064 }, { "auxiliary_loss_clip": 0.01076369, "auxiliary_loss_mlp": 0.01037392, "balance_loss_clip": 1.02192426, "balance_loss_mlp": 1.0237931, "epoch": 0.3339245453179017, "flos": 16689739760640.0, "grad_norm": 1.8340308159497167, "language_loss": 0.81052446, "learning_rate": 2.9971628953132815e-06, "loss": 0.83166212, "num_input_tokens_seen": 119261195, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5234375, "step": 5554, "time_per_iteration": 2.3397092819213867 }, { "auxiliary_loss_clip": 0.01074086, "auxiliary_loss_mlp": 0.01030403, "balance_loss_clip": 1.01522732, "balance_loss_mlp": 1.02320063, "epoch": 0.3339846685705697, "flos": 24609402725760.0, "grad_norm": 1.519066197738685, "language_loss": 0.81127423, "learning_rate": 2.9968353649359996e-06, "loss": 0.83231908, "num_input_tokens_seen": 119282845, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 5555, "time_per_iteration": 2.426910161972046 }, { "auxiliary_loss_clip": 0.01073358, "auxiliary_loss_mlp": 0.01027621, "balance_loss_clip": 1.01359582, "balance_loss_mlp": 1.02268112, "epoch": 0.33404479182323765, "flos": 30952562981760.0, "grad_norm": 1.6786413419295263, "language_loss": 0.74415982, "learning_rate": 2.996507798984227e-06, "loss": 0.76516962, "num_input_tokens_seen": 119304430, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.5078125, "step": 5556, "time_per_iteration": 3.8798019886016846 }, { "auxiliary_loss_clip": 0.01074321, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.01603329, "balance_loss_mlp": 1.02379119, "epoch": 0.3341049150759056, "flos": 23512187427840.0, "grad_norm": 1.8836318491891912, "language_loss": 0.82044494, "learning_rate": 2.9961801974696546e-06, "loss": 0.8414914, "num_input_tokens_seen": 119323830, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.5078125, "step": 5557, "time_per_iteration": 2.3874154090881348 }, { "auxiliary_loss_clip": 0.01075775, "auxiliary_loss_mlp": 0.01032975, "balance_loss_clip": 1.01826465, "balance_loss_mlp": 1.02358103, "epoch": 0.3341650383285736, "flos": 24025620015360.0, "grad_norm": 2.2757069998563657, "language_loss": 0.80175179, "learning_rate": 2.995852560403974e-06, "loss": 0.82283926, "num_input_tokens_seen": 119346340, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5234375, "step": 5558, "time_per_iteration": 2.4520602226257324 }, { "auxiliary_loss_clip": 0.01074148, "auxiliary_loss_mlp": 0.01028048, "balance_loss_clip": 1.01390362, "balance_loss_mlp": 1.02303123, "epoch": 0.33422516158124155, "flos": 24900752954880.0, "grad_norm": 1.7410947955627079, "language_loss": 0.8146323, "learning_rate": 2.9955248877988767e-06, "loss": 0.83565426, "num_input_tokens_seen": 119367285, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.51171875, "step": 5559, "time_per_iteration": 2.4129600524902344 }, { "auxiliary_loss_clip": 0.01074564, "auxiliary_loss_mlp": 0.0103193, "balance_loss_clip": 1.01679003, "balance_loss_mlp": 1.02415633, "epoch": 0.3342852848339095, "flos": 18332403229440.0, "grad_norm": 3.4040807078479327, "language_loss": 0.7177844, "learning_rate": 2.9951971796660565e-06, "loss": 0.73884928, "num_input_tokens_seen": 119385370, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.50390625, "step": 5560, "time_per_iteration": 2.368154287338257 }, { "auxiliary_loss_clip": 0.01077398, "auxiliary_loss_mlp": 0.01032648, "balance_loss_clip": 1.01735377, "balance_loss_mlp": 1.02394676, "epoch": 0.3343454080865775, "flos": 30045170079360.0, "grad_norm": 1.5090462505936386, "language_loss": 0.75033867, "learning_rate": 2.994869436017209e-06, "loss": 0.77143919, "num_input_tokens_seen": 119409150, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53515625, "step": 5561, "time_per_iteration": 2.4563941955566406 }, { "auxiliary_loss_clip": 0.01075304, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 1.0162518, "balance_loss_mlp": 1.02332568, "epoch": 0.33440553133924544, "flos": 16397900772480.0, "grad_norm": 1.636777793952095, "language_loss": 0.69598168, "learning_rate": 2.9945416568640314e-06, "loss": 0.71704608, "num_input_tokens_seen": 119426475, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.51953125, "step": 5562, "time_per_iteration": 2.3684780597686768 }, { "auxiliary_loss_clip": 0.0107414, "auxiliary_loss_mlp": 0.01031801, "balance_loss_clip": 1.01791883, "balance_loss_mlp": 1.02342868, "epoch": 0.3344656545919134, "flos": 24240964481280.0, "grad_norm": 2.016993099031963, "language_loss": 0.64630532, "learning_rate": 2.99421384221822e-06, "loss": 0.66736472, "num_input_tokens_seen": 119446900, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.5078125, "step": 5563, "time_per_iteration": 2.391129493713379 }, { "auxiliary_loss_clip": 0.01078389, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.02034795, "balance_loss_mlp": 1.02499962, "epoch": 0.3345257778445814, "flos": 52116911832960.0, "grad_norm": 4.5676194274434065, "language_loss": 0.74270809, "learning_rate": 2.9938859920914735e-06, "loss": 0.76385391, "num_input_tokens_seen": 119470945, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53125, "step": 5564, "time_per_iteration": 2.6527907848358154 }, { "auxiliary_loss_clip": 0.01016207, "auxiliary_loss_mlp": 0.01003749, "balance_loss_clip": 1.00217545, "balance_loss_mlp": 1.00512195, "epoch": 0.33458590109724934, "flos": 68045614577280.0, "grad_norm": 0.7741006322130335, "language_loss": 0.55468792, "learning_rate": 2.9935581064954934e-06, "loss": 0.57488745, "num_input_tokens_seen": 119529925, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.11083984, "step": 5565, "time_per_iteration": 2.97763991355896 }, { "auxiliary_loss_clip": 0.01073709, "auxiliary_loss_mlp": 0.01028739, "balance_loss_clip": 1.01495206, "balance_loss_mlp": 1.02421498, "epoch": 0.3346460243499173, "flos": 37413275385600.0, "grad_norm": 2.1109905326199274, "language_loss": 0.6467492, "learning_rate": 2.9932301854419794e-06, "loss": 0.66777366, "num_input_tokens_seen": 119550700, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.49414062, "step": 5566, "time_per_iteration": 2.514491081237793 }, { "auxiliary_loss_clip": 0.01075475, "auxiliary_loss_mlp": 0.01030986, "balance_loss_clip": 1.01588249, "balance_loss_mlp": 1.02459669, "epoch": 0.3347061476025853, "flos": 18696372819840.0, "grad_norm": 2.2253847357918835, "language_loss": 0.77230692, "learning_rate": 2.9929022289426352e-06, "loss": 0.79337156, "num_input_tokens_seen": 119569295, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5078125, "step": 5567, "time_per_iteration": 2.34729266166687 }, { "auxiliary_loss_clip": 0.01076869, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.01514792, "balance_loss_mlp": 1.02460814, "epoch": 0.3347662708552533, "flos": 13916972626560.0, "grad_norm": 2.313692891187955, "language_loss": 0.75705385, "learning_rate": 2.9925742370091645e-06, "loss": 0.77812767, "num_input_tokens_seen": 119587375, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.51953125, "step": 5568, "time_per_iteration": 2.3547399044036865 }, { "auxiliary_loss_clip": 0.01076316, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.01811469, "balance_loss_mlp": 1.02347338, "epoch": 0.33482639410792125, "flos": 19749528115200.0, "grad_norm": 2.0441753961713163, "language_loss": 0.70955795, "learning_rate": 2.992246209653272e-06, "loss": 0.7306484, "num_input_tokens_seen": 119604530, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.52734375, "step": 5569, "time_per_iteration": 2.354771137237549 }, { "auxiliary_loss_clip": 0.01076972, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.01256418, "balance_loss_mlp": 1.02347755, "epoch": 0.3348865173605892, "flos": 16102186623360.0, "grad_norm": 2.198998616787981, "language_loss": 0.89482254, "learning_rate": 2.9919181468866653e-06, "loss": 0.91587257, "num_input_tokens_seen": 119621025, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53125, "step": 5570, "time_per_iteration": 2.352205991744995 }, { "auxiliary_loss_clip": 0.01074117, "auxiliary_loss_mlp": 0.01025588, "balance_loss_clip": 1.01072288, "balance_loss_mlp": 1.02277756, "epoch": 0.3349466406132572, "flos": 25007796783360.0, "grad_norm": 2.4539795075892705, "language_loss": 0.79679501, "learning_rate": 2.9915900487210514e-06, "loss": 0.81779206, "num_input_tokens_seen": 119641725, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.515625, "step": 5571, "time_per_iteration": 2.400282382965088 }, { "auxiliary_loss_clip": 0.01015455, "auxiliary_loss_mlp": 0.01005221, "balance_loss_clip": 1.00393963, "balance_loss_mlp": 1.00460982, "epoch": 0.33500676386592515, "flos": 54316647089280.0, "grad_norm": 0.9081892765750889, "language_loss": 0.55968076, "learning_rate": 2.991261915168139e-06, "loss": 0.57988751, "num_input_tokens_seen": 119693560, "router_z_loss_clip": 0.01281738, "router_z_loss_mlp": 0.10839844, "step": 5572, "time_per_iteration": 2.953749895095825 }, { "auxiliary_loss_clip": 0.01076335, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.0207938, "balance_loss_mlp": 1.02485335, "epoch": 0.3350668871185931, "flos": 26796117911040.0, "grad_norm": 2.078657938560479, "language_loss": 0.7806412, "learning_rate": 2.990933746239639e-06, "loss": 0.8017534, "num_input_tokens_seen": 119712935, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.515625, "step": 5573, "time_per_iteration": 2.4105515480041504 }, { "auxiliary_loss_clip": 0.0107709, "auxiliary_loss_mlp": 0.01038027, "balance_loss_clip": 1.02286911, "balance_loss_mlp": 1.02408218, "epoch": 0.3351270103712611, "flos": 33509112865920.0, "grad_norm": 2.509549001053669, "language_loss": 0.72811186, "learning_rate": 2.9906055419472622e-06, "loss": 0.74926305, "num_input_tokens_seen": 119731680, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.53125, "step": 5574, "time_per_iteration": 2.4823031425476074 }, { "auxiliary_loss_clip": 0.01073969, "auxiliary_loss_mlp": 0.01032803, "balance_loss_clip": 1.01743698, "balance_loss_mlp": 1.02348042, "epoch": 0.33518713362392905, "flos": 26505012061440.0, "grad_norm": 1.6871356426871273, "language_loss": 0.87748444, "learning_rate": 2.9902773023027224e-06, "loss": 0.89855218, "num_input_tokens_seen": 119752155, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.50390625, "step": 5575, "time_per_iteration": 2.4122045040130615 }, { "auxiliary_loss_clip": 0.01078844, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.01659584, "balance_loss_mlp": 1.02330542, "epoch": 0.335247256876597, "flos": 17231557150080.0, "grad_norm": 6.901726321552094, "language_loss": 0.82656181, "learning_rate": 2.9899490273177327e-06, "loss": 0.8476879, "num_input_tokens_seen": 119769195, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5546875, "step": 5576, "time_per_iteration": 2.358658790588379 }, { "auxiliary_loss_clip": 0.0107583, "auxiliary_loss_mlp": 0.01031376, "balance_loss_clip": 1.01499701, "balance_loss_mlp": 1.02272081, "epoch": 0.335307380129265, "flos": 25628203376640.0, "grad_norm": 2.250754527870607, "language_loss": 0.72949106, "learning_rate": 2.9896207170040084e-06, "loss": 0.75056314, "num_input_tokens_seen": 119786810, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5577, "time_per_iteration": 2.4029417037963867 }, { "auxiliary_loss_clip": 0.01078041, "auxiliary_loss_mlp": 0.01029663, "balance_loss_clip": 1.01322389, "balance_loss_mlp": 1.02676105, "epoch": 0.33536750338193294, "flos": 19679143080960.0, "grad_norm": 1.8074639118780211, "language_loss": 0.81616819, "learning_rate": 2.989292371373266e-06, "loss": 0.83724523, "num_input_tokens_seen": 119805395, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.51171875, "step": 5578, "time_per_iteration": 2.389028310775757 }, { "auxiliary_loss_clip": 0.01074314, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.01451552, "balance_loss_mlp": 1.0248735, "epoch": 0.3354276266346009, "flos": 18331635179520.0, "grad_norm": 1.6912751610589036, "language_loss": 0.71834564, "learning_rate": 2.9889639904372246e-06, "loss": 0.73937869, "num_input_tokens_seen": 119823135, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49414062, "step": 5579, "time_per_iteration": 2.3517847061157227 }, { "auxiliary_loss_clip": 0.01079201, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.01599455, "balance_loss_mlp": 1.02580476, "epoch": 0.3354877498872689, "flos": 17857584472320.0, "grad_norm": 2.3857803422952277, "language_loss": 0.81320035, "learning_rate": 2.988635574207602e-06, "loss": 0.8343128, "num_input_tokens_seen": 119842265, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53125, "step": 5580, "time_per_iteration": 2.383861780166626 }, { "auxiliary_loss_clip": 0.01077778, "auxiliary_loss_mlp": 0.0102849, "balance_loss_clip": 1.01293325, "balance_loss_mlp": 1.02508652, "epoch": 0.3355478731399369, "flos": 24716586199680.0, "grad_norm": 2.567139607894021, "language_loss": 0.77625459, "learning_rate": 2.988307122696119e-06, "loss": 0.79731727, "num_input_tokens_seen": 119862500, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.52734375, "step": 5581, "time_per_iteration": 2.416560649871826 }, { "auxiliary_loss_clip": 0.01081128, "auxiliary_loss_mlp": 0.01034889, "balance_loss_clip": 1.0163753, "balance_loss_mlp": 1.02548647, "epoch": 0.33560799639260486, "flos": 16872928997760.0, "grad_norm": 2.4729237608761654, "language_loss": 0.74867547, "learning_rate": 2.9879786359144967e-06, "loss": 0.76983559, "num_input_tokens_seen": 119880160, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.5546875, "step": 5582, "time_per_iteration": 2.352843999862671 }, { "auxiliary_loss_clip": 0.01075789, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.01754141, "balance_loss_mlp": 1.02315283, "epoch": 0.3356681196452728, "flos": 18332507963520.0, "grad_norm": 1.619946468347716, "language_loss": 0.82264602, "learning_rate": 2.9876501138744577e-06, "loss": 0.84373367, "num_input_tokens_seen": 119899040, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.52734375, "step": 5583, "time_per_iteration": 2.373098611831665 }, { "auxiliary_loss_clip": 0.01077088, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.01961994, "balance_loss_mlp": 1.02562809, "epoch": 0.3357282428979408, "flos": 34749192913920.0, "grad_norm": 2.1014313550016785, "language_loss": 0.77570271, "learning_rate": 2.9873215565877274e-06, "loss": 0.79682177, "num_input_tokens_seen": 119921120, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.515625, "step": 5584, "time_per_iteration": 2.4951212406158447 }, { "auxiliary_loss_clip": 0.01077104, "auxiliary_loss_mlp": 0.01031395, "balance_loss_clip": 1.01571941, "balance_loss_mlp": 1.02345395, "epoch": 0.33578836615060875, "flos": 21579011602560.0, "grad_norm": 2.3506498722560765, "language_loss": 0.76042569, "learning_rate": 2.9869929640660303e-06, "loss": 0.78151071, "num_input_tokens_seen": 119940165, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5390625, "step": 5585, "time_per_iteration": 2.3909871578216553 }, { "auxiliary_loss_clip": 0.0107393, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.01537824, "balance_loss_mlp": 1.023453, "epoch": 0.3358484894032767, "flos": 24529277422080.0, "grad_norm": 1.4681236171573067, "language_loss": 0.77592355, "learning_rate": 2.9866643363210928e-06, "loss": 0.79695916, "num_input_tokens_seen": 119959730, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.50390625, "step": 5586, "time_per_iteration": 3.894225597381592 }, { "auxiliary_loss_clip": 0.01080874, "auxiliary_loss_mlp": 0.01038859, "balance_loss_clip": 1.02164531, "balance_loss_mlp": 1.02675009, "epoch": 0.3359086126559447, "flos": 22454493655680.0, "grad_norm": 2.1479307669552004, "language_loss": 0.80926239, "learning_rate": 2.9863356733646437e-06, "loss": 0.83045971, "num_input_tokens_seen": 119979315, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5390625, "step": 5587, "time_per_iteration": 2.38244366645813 }, { "auxiliary_loss_clip": 0.01073225, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.01838279, "balance_loss_mlp": 1.02423763, "epoch": 0.33596873590861265, "flos": 16542790381440.0, "grad_norm": 1.9372760399643874, "language_loss": 0.66947579, "learning_rate": 2.9860069752084115e-06, "loss": 0.69053209, "num_input_tokens_seen": 119996140, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49023438, "step": 5588, "time_per_iteration": 2.36731219291687 }, { "auxiliary_loss_clip": 0.01076421, "auxiliary_loss_mlp": 0.01041194, "balance_loss_clip": 1.02563763, "balance_loss_mlp": 1.0249002, "epoch": 0.3360288591612806, "flos": 31174470783360.0, "grad_norm": 1.9303552909992168, "language_loss": 0.69827455, "learning_rate": 2.985678241864126e-06, "loss": 0.71945071, "num_input_tokens_seen": 120017720, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.515625, "step": 5589, "time_per_iteration": 2.453322410583496 }, { "auxiliary_loss_clip": 0.01075426, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 1.01866531, "balance_loss_mlp": 1.02233672, "epoch": 0.3360889824139486, "flos": 23695760689920.0, "grad_norm": 1.6541257405760252, "language_loss": 0.6743663, "learning_rate": 2.9853494733435204e-06, "loss": 0.69547141, "num_input_tokens_seen": 120036335, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5590, "time_per_iteration": 3.856215715408325 }, { "auxiliary_loss_clip": 0.01072904, "auxiliary_loss_mlp": 0.01035295, "balance_loss_clip": 1.01961875, "balance_loss_mlp": 1.02358758, "epoch": 0.33614910566661654, "flos": 19317093615360.0, "grad_norm": 2.368900515365514, "language_loss": 0.73454851, "learning_rate": 2.985020669658326e-06, "loss": 0.75563049, "num_input_tokens_seen": 120056120, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.49414062, "step": 5591, "time_per_iteration": 2.393751621246338 }, { "auxiliary_loss_clip": 0.01075056, "auxiliary_loss_mlp": 0.0103177, "balance_loss_clip": 1.01710773, "balance_loss_mlp": 1.02314603, "epoch": 0.3362092289192845, "flos": 16471323095040.0, "grad_norm": 2.0580623479939204, "language_loss": 0.69646275, "learning_rate": 2.984691830820278e-06, "loss": 0.71753097, "num_input_tokens_seen": 120073650, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.515625, "step": 5592, "time_per_iteration": 2.3774869441986084 }, { "auxiliary_loss_clip": 0.01071907, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.01660371, "balance_loss_mlp": 1.02291036, "epoch": 0.33626935217195253, "flos": 24242430758400.0, "grad_norm": 2.658665761255701, "language_loss": 0.76120383, "learning_rate": 2.9843629568411114e-06, "loss": 0.78223014, "num_input_tokens_seen": 120093260, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.48828125, "step": 5593, "time_per_iteration": 2.41409969329834 }, { "auxiliary_loss_clip": 0.01074446, "auxiliary_loss_mlp": 0.01033657, "balance_loss_clip": 1.016348, "balance_loss_mlp": 1.02114677, "epoch": 0.3363294754246205, "flos": 19717756911360.0, "grad_norm": 1.9364391321455898, "language_loss": 0.71527827, "learning_rate": 2.984034047732563e-06, "loss": 0.73635936, "num_input_tokens_seen": 120111830, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.53125, "step": 5594, "time_per_iteration": 2.393780469894409 }, { "auxiliary_loss_clip": 0.01078368, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.01581812, "balance_loss_mlp": 1.02515757, "epoch": 0.33638959867728846, "flos": 22595333546880.0, "grad_norm": 4.823768842939805, "language_loss": 0.80069888, "learning_rate": 2.983705103506371e-06, "loss": 0.82179904, "num_input_tokens_seen": 120130470, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53125, "step": 5595, "time_per_iteration": 3.772585153579712 }, { "auxiliary_loss_clip": 0.0107574, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.01576948, "balance_loss_mlp": 1.02415037, "epoch": 0.3364497219299564, "flos": 20993727703680.0, "grad_norm": 3.183615868305529, "language_loss": 0.81332552, "learning_rate": 2.983376124174274e-06, "loss": 0.83438021, "num_input_tokens_seen": 120150735, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.515625, "step": 5596, "time_per_iteration": 2.4326045513153076 }, { "auxiliary_loss_clip": 0.01075532, "auxiliary_loss_mlp": 0.01026483, "balance_loss_clip": 1.01266718, "balance_loss_mlp": 1.02411282, "epoch": 0.3365098451826244, "flos": 25227435346560.0, "grad_norm": 1.6261324892571685, "language_loss": 0.75755507, "learning_rate": 2.9830471097480133e-06, "loss": 0.77857518, "num_input_tokens_seen": 120173230, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.51171875, "step": 5597, "time_per_iteration": 2.4832892417907715 }, { "auxiliary_loss_clip": 0.01074414, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.01399422, "balance_loss_mlp": 1.02418804, "epoch": 0.33656996843529235, "flos": 24570544515840.0, "grad_norm": 1.763349238154856, "language_loss": 0.78587317, "learning_rate": 2.982718060239329e-06, "loss": 0.80689836, "num_input_tokens_seen": 120191860, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.5, "step": 5598, "time_per_iteration": 2.4460082054138184 }, { "auxiliary_loss_clip": 0.01079467, "auxiliary_loss_mlp": 0.01030174, "balance_loss_clip": 1.01379418, "balance_loss_mlp": 1.02409256, "epoch": 0.3366300916879603, "flos": 44089436989440.0, "grad_norm": 6.318534761904872, "language_loss": 0.64851892, "learning_rate": 2.9823889756599652e-06, "loss": 0.66961539, "num_input_tokens_seen": 120219195, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5546875, "step": 5599, "time_per_iteration": 2.5885472297668457 }, { "auxiliary_loss_clip": 0.01081874, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.02197492, "balance_loss_mlp": 1.02561009, "epoch": 0.3366902149406283, "flos": 13879057023360.0, "grad_norm": 3.913644062184387, "language_loss": 0.82308364, "learning_rate": 2.9820598560216653e-06, "loss": 0.84429187, "num_input_tokens_seen": 120232950, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5625, "step": 5600, "time_per_iteration": 2.360757350921631 }, { "auxiliary_loss_clip": 0.01077675, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.0187397, "balance_loss_mlp": 1.02370954, "epoch": 0.33675033819329625, "flos": 16252173290880.0, "grad_norm": 2.5046643143532537, "language_loss": 0.83401078, "learning_rate": 2.9817307013361764e-06, "loss": 0.85514444, "num_input_tokens_seen": 120248865, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5390625, "step": 5601, "time_per_iteration": 2.3644134998321533 }, { "auxiliary_loss_clip": 0.01076271, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.01521301, "balance_loss_mlp": 1.02712429, "epoch": 0.3368104614459642, "flos": 17054861425920.0, "grad_norm": 1.861699238392671, "language_loss": 0.83444321, "learning_rate": 2.9814015116152437e-06, "loss": 0.85549414, "num_input_tokens_seen": 120267820, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.4921875, "step": 5602, "time_per_iteration": 2.3646373748779297 }, { "auxiliary_loss_clip": 0.01077992, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.01659739, "balance_loss_mlp": 1.02583826, "epoch": 0.3368705846986322, "flos": 17857654295040.0, "grad_norm": 2.0427291698062997, "language_loss": 0.69899702, "learning_rate": 2.9810722868706154e-06, "loss": 0.72010434, "num_input_tokens_seen": 120286540, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.51953125, "step": 5603, "time_per_iteration": 2.3502278327941895 }, { "auxiliary_loss_clip": 0.01079199, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.01384962, "balance_loss_mlp": 1.0249238, "epoch": 0.33693070795130015, "flos": 22928404717440.0, "grad_norm": 1.4579416458193406, "language_loss": 0.83036739, "learning_rate": 2.980743027114041e-06, "loss": 0.85146081, "num_input_tokens_seen": 120307305, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.54296875, "step": 5604, "time_per_iteration": 2.3966259956359863 }, { "auxiliary_loss_clip": 0.01077618, "auxiliary_loss_mlp": 0.01028898, "balance_loss_clip": 1.01315093, "balance_loss_mlp": 1.02480912, "epoch": 0.3369908312039681, "flos": 22016368604160.0, "grad_norm": 1.3949106231684925, "language_loss": 0.73859751, "learning_rate": 2.98041373235727e-06, "loss": 0.75966263, "num_input_tokens_seen": 120327845, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.52734375, "step": 5605, "time_per_iteration": 2.3924612998962402 }, { "auxiliary_loss_clip": 0.01078442, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 1.01965797, "balance_loss_mlp": 1.02344894, "epoch": 0.33705095445663613, "flos": 11801166145920.0, "grad_norm": 2.4049308667503704, "language_loss": 0.83449692, "learning_rate": 2.980084402612056e-06, "loss": 0.85564131, "num_input_tokens_seen": 120343255, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.55078125, "step": 5606, "time_per_iteration": 2.351778984069824 }, { "auxiliary_loss_clip": 0.01074129, "auxiliary_loss_mlp": 0.01029338, "balance_loss_clip": 1.01455641, "balance_loss_mlp": 1.0226649, "epoch": 0.3371110777093041, "flos": 25045223627520.0, "grad_norm": 1.560513435668126, "language_loss": 0.67990649, "learning_rate": 2.97975503789015e-06, "loss": 0.70094109, "num_input_tokens_seen": 120361745, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.515625, "step": 5607, "time_per_iteration": 2.402966022491455 }, { "auxiliary_loss_clip": 0.01079023, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.0140481, "balance_loss_mlp": 1.02460265, "epoch": 0.33717120096197206, "flos": 26577805979520.0, "grad_norm": 2.5935135639872477, "language_loss": 0.70628291, "learning_rate": 2.979425638203307e-06, "loss": 0.72737193, "num_input_tokens_seen": 120380565, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.54296875, "step": 5608, "time_per_iteration": 2.417232036590576 }, { "auxiliary_loss_clip": 0.01077287, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 1.01711226, "balance_loss_mlp": 1.02548313, "epoch": 0.33723132421464, "flos": 15157646167680.0, "grad_norm": 1.9004754845051264, "language_loss": 0.79216373, "learning_rate": 2.9790962035632823e-06, "loss": 0.81325138, "num_input_tokens_seen": 120399235, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.515625, "step": 5609, "time_per_iteration": 2.3566930294036865 }, { "auxiliary_loss_clip": 0.01076582, "auxiliary_loss_mlp": 0.01036807, "balance_loss_clip": 1.0208807, "balance_loss_mlp": 1.02438807, "epoch": 0.337291447467308, "flos": 23435099412480.0, "grad_norm": 4.681624607450182, "language_loss": 0.82176632, "learning_rate": 2.978766733981833e-06, "loss": 0.84290016, "num_input_tokens_seen": 120420095, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5234375, "step": 5610, "time_per_iteration": 2.4102094173431396 }, { "auxiliary_loss_clip": 0.01073582, "auxiliary_loss_mlp": 0.0103033, "balance_loss_clip": 1.01439166, "balance_loss_mlp": 1.0230006, "epoch": 0.33735157071997596, "flos": 17237212790400.0, "grad_norm": 2.080899212381112, "language_loss": 0.81895936, "learning_rate": 2.9784372294707165e-06, "loss": 0.83999848, "num_input_tokens_seen": 120437690, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5078125, "step": 5611, "time_per_iteration": 2.347810745239258 }, { "auxiliary_loss_clip": 0.01078331, "auxiliary_loss_mlp": 0.01036802, "balance_loss_clip": 1.01898026, "balance_loss_mlp": 1.0247165, "epoch": 0.3374116939726439, "flos": 28256115813120.0, "grad_norm": 1.6169619410858138, "language_loss": 0.79374301, "learning_rate": 2.9781076900416923e-06, "loss": 0.81489432, "num_input_tokens_seen": 120459240, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.53515625, "step": 5612, "time_per_iteration": 2.436657667160034 }, { "auxiliary_loss_clip": 0.01075045, "auxiliary_loss_mlp": 0.01030121, "balance_loss_clip": 1.01448071, "balance_loss_mlp": 1.02245164, "epoch": 0.3374718172253119, "flos": 35917910409600.0, "grad_norm": 3.1761571225599345, "language_loss": 0.69668424, "learning_rate": 2.97777811570652e-06, "loss": 0.71773589, "num_input_tokens_seen": 120481090, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5234375, "step": 5613, "time_per_iteration": 2.487042188644409 }, { "auxiliary_loss_clip": 0.01076619, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.01573491, "balance_loss_mlp": 1.02403438, "epoch": 0.33753194047797985, "flos": 18185698229760.0, "grad_norm": 4.249750231575481, "language_loss": 0.7980001, "learning_rate": 2.977448506476962e-06, "loss": 0.81909221, "num_input_tokens_seen": 120500045, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.52734375, "step": 5614, "time_per_iteration": 2.379701614379883 }, { "auxiliary_loss_clip": 0.0107648, "auxiliary_loss_mlp": 0.0103566, "balance_loss_clip": 1.01924527, "balance_loss_mlp": 1.02390707, "epoch": 0.3375920637306478, "flos": 23147798901120.0, "grad_norm": 1.6485986299045274, "language_loss": 0.90878582, "learning_rate": 2.977118862364781e-06, "loss": 0.9299072, "num_input_tokens_seen": 120521125, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.52734375, "step": 5615, "time_per_iteration": 2.3953943252563477 }, { "auxiliary_loss_clip": 0.01075264, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.01466739, "balance_loss_mlp": 1.02394128, "epoch": 0.3376521869833158, "flos": 23111105195520.0, "grad_norm": 2.526942021719091, "language_loss": 0.81231046, "learning_rate": 2.9767891833817424e-06, "loss": 0.83335471, "num_input_tokens_seen": 120539180, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.51171875, "step": 5616, "time_per_iteration": 2.390759229660034 }, { "auxiliary_loss_clip": 0.01078023, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.01626003, "balance_loss_mlp": 1.02376103, "epoch": 0.33771231023598375, "flos": 19273766751360.0, "grad_norm": 2.126146631811231, "language_loss": 0.83726001, "learning_rate": 2.976459469539609e-06, "loss": 0.85837483, "num_input_tokens_seen": 120556280, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.54296875, "step": 5617, "time_per_iteration": 2.3498964309692383 }, { "auxiliary_loss_clip": 0.01076682, "auxiliary_loss_mlp": 0.01032118, "balance_loss_clip": 1.01750875, "balance_loss_mlp": 1.02386653, "epoch": 0.3377724334886517, "flos": 18149213992320.0, "grad_norm": 1.3591091061291356, "language_loss": 0.80304039, "learning_rate": 2.97612972085015e-06, "loss": 0.82412839, "num_input_tokens_seen": 120575395, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.52734375, "step": 5618, "time_per_iteration": 2.3753347396850586 }, { "auxiliary_loss_clip": 0.01076327, "auxiliary_loss_mlp": 0.01029588, "balance_loss_clip": 1.01401925, "balance_loss_mlp": 1.02267492, "epoch": 0.3378325567413197, "flos": 25774803642240.0, "grad_norm": 3.1814983270588586, "language_loss": 0.70787764, "learning_rate": 2.9757999373251315e-06, "loss": 0.72893679, "num_input_tokens_seen": 120596075, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53515625, "step": 5619, "time_per_iteration": 2.4107887744903564 }, { "auxiliary_loss_clip": 0.0107475, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.02047396, "balance_loss_mlp": 1.02219379, "epoch": 0.3378926799939877, "flos": 21316255643520.0, "grad_norm": 2.647308501402637, "language_loss": 0.69699872, "learning_rate": 2.9754701189763236e-06, "loss": 0.71811402, "num_input_tokens_seen": 120614195, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5234375, "step": 5620, "time_per_iteration": 2.39211368560791 }, { "auxiliary_loss_clip": 0.01078578, "auxiliary_loss_mlp": 0.01031369, "balance_loss_clip": 1.01637197, "balance_loss_mlp": 1.02622032, "epoch": 0.33795280324665566, "flos": 24898867741440.0, "grad_norm": 1.6244545301680078, "language_loss": 0.67200458, "learning_rate": 2.975140265815496e-06, "loss": 0.69310403, "num_input_tokens_seen": 120634475, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5234375, "step": 5621, "time_per_iteration": 2.4176483154296875 }, { "auxiliary_loss_clip": 0.01073725, "auxiliary_loss_mlp": 0.01031109, "balance_loss_clip": 1.01666045, "balance_loss_mlp": 1.02346575, "epoch": 0.33801292649932363, "flos": 24752791146240.0, "grad_norm": 1.8086469645672796, "language_loss": 0.82557905, "learning_rate": 2.9748103778544213e-06, "loss": 0.84662735, "num_input_tokens_seen": 120654980, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.50390625, "step": 5622, "time_per_iteration": 2.4154491424560547 }, { "auxiliary_loss_clip": 0.01074326, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.01735902, "balance_loss_mlp": 1.02285719, "epoch": 0.3380730497519916, "flos": 26722765411200.0, "grad_norm": 1.4356041179159156, "language_loss": 0.7320528, "learning_rate": 2.974480455104871e-06, "loss": 0.75311476, "num_input_tokens_seen": 120676245, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.515625, "step": 5623, "time_per_iteration": 2.4266836643218994 }, { "auxiliary_loss_clip": 0.0101272, "auxiliary_loss_mlp": 0.01003525, "balance_loss_clip": 1.00199938, "balance_loss_mlp": 1.00198269, "epoch": 0.33813317300465956, "flos": 70032241560960.0, "grad_norm": 0.7436209765602, "language_loss": 0.54921335, "learning_rate": 2.9741504975786206e-06, "loss": 0.56937581, "num_input_tokens_seen": 120741965, "router_z_loss_clip": 0.01525879, "router_z_loss_mlp": 0.10742188, "step": 5624, "time_per_iteration": 3.153130054473877 }, { "auxiliary_loss_clip": 0.01077752, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.02203345, "balance_loss_mlp": 1.02473402, "epoch": 0.3381932962573275, "flos": 24096179606400.0, "grad_norm": 2.549702212202259, "language_loss": 0.72598791, "learning_rate": 2.9738205052874444e-06, "loss": 0.74713939, "num_input_tokens_seen": 120760410, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53125, "step": 5625, "time_per_iteration": 2.3772730827331543 }, { "auxiliary_loss_clip": 0.01076543, "auxiliary_loss_mlp": 0.01030958, "balance_loss_clip": 1.01548493, "balance_loss_mlp": 1.02353954, "epoch": 0.3382534195099955, "flos": 19277327710080.0, "grad_norm": 4.643797502931981, "language_loss": 0.70201731, "learning_rate": 2.9734904782431196e-06, "loss": 0.72309232, "num_input_tokens_seen": 120777705, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53125, "step": 5626, "time_per_iteration": 3.8215889930725098 }, { "auxiliary_loss_clip": 0.01073943, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 1.01525116, "balance_loss_mlp": 1.0228622, "epoch": 0.33831354276266346, "flos": 25225131196800.0, "grad_norm": 1.6269249559112355, "language_loss": 0.81360829, "learning_rate": 2.973160416457423e-06, "loss": 0.83465469, "num_input_tokens_seen": 120798660, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.51171875, "step": 5627, "time_per_iteration": 2.4187004566192627 }, { "auxiliary_loss_clip": 0.01081669, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.02085328, "balance_loss_mlp": 1.02671492, "epoch": 0.3383736660153314, "flos": 23110895727360.0, "grad_norm": 2.3035341468533828, "language_loss": 0.80695158, "learning_rate": 2.9728303199421354e-06, "loss": 0.82813925, "num_input_tokens_seen": 120816705, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.55078125, "step": 5628, "time_per_iteration": 2.369699001312256 }, { "auxiliary_loss_clip": 0.01075366, "auxiliary_loss_mlp": 0.01031168, "balance_loss_clip": 1.01518178, "balance_loss_mlp": 1.02260685, "epoch": 0.3384337892679994, "flos": 23476017392640.0, "grad_norm": 1.9356229428978402, "language_loss": 0.76929748, "learning_rate": 2.9725001887090358e-06, "loss": 0.79036283, "num_input_tokens_seen": 120835375, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.52734375, "step": 5629, "time_per_iteration": 2.383164167404175 }, { "auxiliary_loss_clip": 0.01076709, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.01631522, "balance_loss_mlp": 1.02310622, "epoch": 0.33849391252066735, "flos": 19424835671040.0, "grad_norm": 1.8089866933099867, "language_loss": 0.84739345, "learning_rate": 2.9721700227699055e-06, "loss": 0.86848569, "num_input_tokens_seen": 120854260, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.53515625, "step": 5630, "time_per_iteration": 3.8142900466918945 }, { "auxiliary_loss_clip": 0.01076486, "auxiliary_loss_mlp": 0.01029818, "balance_loss_clip": 1.01495266, "balance_loss_mlp": 1.02372074, "epoch": 0.3385540357733353, "flos": 21063903269760.0, "grad_norm": 2.744936973249887, "language_loss": 0.71665031, "learning_rate": 2.9718398221365285e-06, "loss": 0.73771334, "num_input_tokens_seen": 120871590, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.52734375, "step": 5631, "time_per_iteration": 2.3694472312927246 }, { "auxiliary_loss_clip": 0.01012176, "auxiliary_loss_mlp": 0.01004933, "balance_loss_clip": 1.00341284, "balance_loss_mlp": 1.00143838, "epoch": 0.3386141590260033, "flos": 69205220208000.0, "grad_norm": 0.8448382363837306, "language_loss": 0.56190181, "learning_rate": 2.9715095868206874e-06, "loss": 0.58207297, "num_input_tokens_seen": 120925550, "router_z_loss_clip": 0.01519775, "router_z_loss_mlp": 0.10742188, "step": 5632, "time_per_iteration": 3.065547227859497 }, { "auxiliary_loss_clip": 0.01075578, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.01295805, "balance_loss_mlp": 1.02369523, "epoch": 0.3386742822786713, "flos": 25518331728000.0, "grad_norm": 1.519673194388011, "language_loss": 0.80069363, "learning_rate": 2.9711793168341686e-06, "loss": 0.82173479, "num_input_tokens_seen": 120947620, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.51953125, "step": 5633, "time_per_iteration": 2.4332261085510254 }, { "auxiliary_loss_clip": 0.01074967, "auxiliary_loss_mlp": 0.01029958, "balance_loss_clip": 1.01499772, "balance_loss_mlp": 1.02392209, "epoch": 0.33873440553133927, "flos": 23621989253760.0, "grad_norm": 1.8328984334662508, "language_loss": 0.592664, "learning_rate": 2.9708490121887587e-06, "loss": 0.61371326, "num_input_tokens_seen": 120965205, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.51171875, "step": 5634, "time_per_iteration": 2.3915348052978516 }, { "auxiliary_loss_clip": 0.0107395, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.01555061, "balance_loss_mlp": 1.02264261, "epoch": 0.33879452878400723, "flos": 17088029084160.0, "grad_norm": 2.1240742698835064, "language_loss": 0.92571288, "learning_rate": 2.9705186728962436e-06, "loss": 0.94675547, "num_input_tokens_seen": 120983560, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.51171875, "step": 5635, "time_per_iteration": 3.7240724563598633 }, { "auxiliary_loss_clip": 0.01074799, "auxiliary_loss_mlp": 0.0102773, "balance_loss_clip": 1.01378798, "balance_loss_mlp": 1.02493238, "epoch": 0.3388546520366752, "flos": 15741149587200.0, "grad_norm": 2.8554016811383764, "language_loss": 0.75170875, "learning_rate": 2.9701882989684145e-06, "loss": 0.77273405, "num_input_tokens_seen": 121001400, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.5, "step": 5636, "time_per_iteration": 2.3523852825164795 }, { "auxiliary_loss_clip": 0.0107459, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.01800776, "balance_loss_mlp": 1.02363765, "epoch": 0.33891477528934316, "flos": 22417660304640.0, "grad_norm": 1.5282647174407973, "language_loss": 0.83106989, "learning_rate": 2.96985789041706e-06, "loss": 0.85215294, "num_input_tokens_seen": 121021760, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5078125, "step": 5637, "time_per_iteration": 2.402747392654419 }, { "auxiliary_loss_clip": 0.0107723, "auxiliary_loss_mlp": 0.01035413, "balance_loss_clip": 1.01957023, "balance_loss_mlp": 1.02447546, "epoch": 0.3389748985420111, "flos": 17273871584640.0, "grad_norm": 2.1993904029405082, "language_loss": 0.69675529, "learning_rate": 2.9695274472539725e-06, "loss": 0.71788174, "num_input_tokens_seen": 121041070, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.52734375, "step": 5638, "time_per_iteration": 2.3540613651275635 }, { "auxiliary_loss_clip": 0.01076904, "auxiliary_loss_mlp": 0.01038821, "balance_loss_clip": 1.02290702, "balance_loss_mlp": 1.02506208, "epoch": 0.3390350217946791, "flos": 27743765477760.0, "grad_norm": 1.695284926964946, "language_loss": 0.80802101, "learning_rate": 2.9691969694909443e-06, "loss": 0.82917827, "num_input_tokens_seen": 121060890, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.51953125, "step": 5639, "time_per_iteration": 2.4445531368255615 }, { "auxiliary_loss_clip": 0.01078085, "auxiliary_loss_mlp": 0.01039751, "balance_loss_clip": 1.02337193, "balance_loss_mlp": 1.02449226, "epoch": 0.33909514504734706, "flos": 20338756997760.0, "grad_norm": 2.689527476927121, "language_loss": 0.67899245, "learning_rate": 2.9688664571397696e-06, "loss": 0.70017081, "num_input_tokens_seen": 121079135, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53515625, "step": 5640, "time_per_iteration": 2.36692214012146 }, { "auxiliary_loss_clip": 0.01075868, "auxiliary_loss_mlp": 0.01036855, "balance_loss_clip": 1.021137, "balance_loss_mlp": 1.02434814, "epoch": 0.339155268300015, "flos": 14829148385280.0, "grad_norm": 1.674574474960228, "language_loss": 0.69692838, "learning_rate": 2.9685359102122432e-06, "loss": 0.71805567, "num_input_tokens_seen": 121097685, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.515625, "step": 5641, "time_per_iteration": 2.367981195449829 }, { "auxiliary_loss_clip": 0.01077071, "auxiliary_loss_mlp": 0.01037029, "balance_loss_clip": 1.02202082, "balance_loss_mlp": 1.02441025, "epoch": 0.339215391552683, "flos": 26066747364480.0, "grad_norm": 1.7932676903481128, "language_loss": 0.8727544, "learning_rate": 2.9682053287201615e-06, "loss": 0.89389545, "num_input_tokens_seen": 121115640, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.52734375, "step": 5642, "time_per_iteration": 2.408924102783203 }, { "auxiliary_loss_clip": 0.01070217, "auxiliary_loss_mlp": 0.01027118, "balance_loss_clip": 1.0143925, "balance_loss_mlp": 1.02345514, "epoch": 0.33927551480535095, "flos": 14573828545920.0, "grad_norm": 2.438411095618502, "language_loss": 0.84156924, "learning_rate": 2.967874712675322e-06, "loss": 0.86254263, "num_input_tokens_seen": 121132485, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.46875, "step": 5643, "time_per_iteration": 2.378606081008911 }, { "auxiliary_loss_clip": 0.01076258, "auxiliary_loss_mlp": 0.01039051, "balance_loss_clip": 1.02366066, "balance_loss_mlp": 1.02433157, "epoch": 0.3393356380580189, "flos": 23804445352320.0, "grad_norm": 1.6008140968226678, "language_loss": 0.76782477, "learning_rate": 2.9675440620895233e-06, "loss": 0.78897786, "num_input_tokens_seen": 121152935, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.51953125, "step": 5644, "time_per_iteration": 2.3966500759124756 }, { "auxiliary_loss_clip": 0.0107423, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.02316117, "balance_loss_mlp": 1.02269149, "epoch": 0.3393957613106869, "flos": 17346909882240.0, "grad_norm": 4.295219821805532, "language_loss": 0.62777728, "learning_rate": 2.9672133769745664e-06, "loss": 0.6489048, "num_input_tokens_seen": 121169835, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.515625, "step": 5645, "time_per_iteration": 2.369154930114746 }, { "auxiliary_loss_clip": 0.0107192, "auxiliary_loss_mlp": 0.01025699, "balance_loss_clip": 1.01165605, "balance_loss_mlp": 1.02285695, "epoch": 0.3394558845633549, "flos": 28432846448640.0, "grad_norm": 1.8700672525926678, "language_loss": 0.76823199, "learning_rate": 2.966882657342252e-06, "loss": 0.78920817, "num_input_tokens_seen": 121190290, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49023438, "step": 5646, "time_per_iteration": 2.427600145339966 }, { "auxiliary_loss_clip": 0.01077248, "auxiliary_loss_mlp": 0.01034818, "balance_loss_clip": 1.0186888, "balance_loss_mlp": 1.02365959, "epoch": 0.33951600781602287, "flos": 22085950677120.0, "grad_norm": 2.0542152627864807, "language_loss": 0.78814727, "learning_rate": 2.9665519032043825e-06, "loss": 0.80926788, "num_input_tokens_seen": 121209060, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.53515625, "step": 5647, "time_per_iteration": 2.3867404460906982 }, { "auxiliary_loss_clip": 0.01076388, "auxiliary_loss_mlp": 0.01026807, "balance_loss_clip": 1.01215613, "balance_loss_mlp": 1.02448642, "epoch": 0.33957613106869083, "flos": 23877134536320.0, "grad_norm": 2.481785087077141, "language_loss": 0.77033567, "learning_rate": 2.9662211145727618e-06, "loss": 0.79136753, "num_input_tokens_seen": 121227480, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.51953125, "step": 5648, "time_per_iteration": 2.3942325115203857 }, { "auxiliary_loss_clip": 0.01013995, "auxiliary_loss_mlp": 0.01006688, "balance_loss_clip": 1.00538874, "balance_loss_mlp": 1.00304985, "epoch": 0.3396362543213588, "flos": 71239014305280.0, "grad_norm": 0.774301705182907, "language_loss": 0.56301385, "learning_rate": 2.965890291459195e-06, "loss": 0.58322066, "num_input_tokens_seen": 121291305, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.109375, "step": 5649, "time_per_iteration": 3.0583043098449707 }, { "auxiliary_loss_clip": 0.01074092, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.02161837, "balance_loss_mlp": 1.02277708, "epoch": 0.33969637757402676, "flos": 25920426389760.0, "grad_norm": 1.5210174824094578, "language_loss": 0.85373539, "learning_rate": 2.9655594338754887e-06, "loss": 0.87484837, "num_input_tokens_seen": 121312740, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51171875, "step": 5650, "time_per_iteration": 2.435272216796875 }, { "auxiliary_loss_clip": 0.01072556, "auxiliary_loss_mlp": 0.01022831, "balance_loss_clip": 1.00887132, "balance_loss_mlp": 1.02285469, "epoch": 0.33975650082669473, "flos": 35260286440320.0, "grad_norm": 1.8909576118453764, "language_loss": 0.71000648, "learning_rate": 2.9652285418334496e-06, "loss": 0.73096031, "num_input_tokens_seen": 121334220, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.49609375, "step": 5651, "time_per_iteration": 2.5079214572906494 }, { "auxiliary_loss_clip": 0.010732, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 1.01514578, "balance_loss_mlp": 1.02314401, "epoch": 0.3398166240793627, "flos": 16646273251200.0, "grad_norm": 1.8315780166776139, "language_loss": 0.81215549, "learning_rate": 2.964897615344886e-06, "loss": 0.83318877, "num_input_tokens_seen": 121351870, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5, "step": 5652, "time_per_iteration": 2.3635478019714355 }, { "auxiliary_loss_clip": 0.01077169, "auxiliary_loss_mlp": 0.01033078, "balance_loss_clip": 1.01691341, "balance_loss_mlp": 1.02482486, "epoch": 0.33987674733203066, "flos": 24061022000640.0, "grad_norm": 1.7951883801322637, "language_loss": 0.76595128, "learning_rate": 2.9645666544216097e-06, "loss": 0.7870537, "num_input_tokens_seen": 121373400, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5234375, "step": 5653, "time_per_iteration": 2.417473554611206 }, { "auxiliary_loss_clip": 0.01076131, "auxiliary_loss_mlp": 0.01027669, "balance_loss_clip": 1.01310778, "balance_loss_mlp": 1.0246743, "epoch": 0.3399368705846986, "flos": 13250132058240.0, "grad_norm": 2.875885198306419, "language_loss": 0.86475319, "learning_rate": 2.9642356590754298e-06, "loss": 0.88579118, "num_input_tokens_seen": 121385225, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.515625, "step": 5654, "time_per_iteration": 2.3092591762542725 }, { "auxiliary_loss_clip": 0.01073479, "auxiliary_loss_mlp": 0.01028222, "balance_loss_clip": 1.01392317, "balance_loss_mlp": 1.02191973, "epoch": 0.3399969938373666, "flos": 27011706756480.0, "grad_norm": 2.428931871480884, "language_loss": 0.65184164, "learning_rate": 2.9639046293181603e-06, "loss": 0.67285866, "num_input_tokens_seen": 121404735, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.515625, "step": 5655, "time_per_iteration": 2.426403522491455 }, { "auxiliary_loss_clip": 0.01075061, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.01450205, "balance_loss_mlp": 1.02433681, "epoch": 0.34005711709003456, "flos": 28548792673920.0, "grad_norm": 1.4290279978617195, "language_loss": 0.76443708, "learning_rate": 2.963573565161613e-06, "loss": 0.78547001, "num_input_tokens_seen": 121426780, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.5078125, "step": 5656, "time_per_iteration": 2.440124988555908 }, { "auxiliary_loss_clip": 0.0107775, "auxiliary_loss_mlp": 0.01031176, "balance_loss_clip": 1.01574993, "balance_loss_mlp": 1.02358961, "epoch": 0.3401172403427025, "flos": 21615914776320.0, "grad_norm": 2.028324762550039, "language_loss": 0.8266331, "learning_rate": 2.963242466617605e-06, "loss": 0.84772229, "num_input_tokens_seen": 121447245, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5390625, "step": 5657, "time_per_iteration": 2.4245548248291016 }, { "auxiliary_loss_clip": 0.01075183, "auxiliary_loss_mlp": 0.01026801, "balance_loss_clip": 1.01234102, "balance_loss_mlp": 1.024786, "epoch": 0.3401773635953705, "flos": 25884570556800.0, "grad_norm": 1.8957057597359546, "language_loss": 0.85232812, "learning_rate": 2.9629113336979505e-06, "loss": 0.87334794, "num_input_tokens_seen": 121468165, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.50390625, "step": 5658, "time_per_iteration": 2.4326703548431396 }, { "auxiliary_loss_clip": 0.01012248, "auxiliary_loss_mlp": 0.01002538, "balance_loss_clip": 1.00134003, "balance_loss_mlp": 1.00153422, "epoch": 0.3402374868480385, "flos": 65504704982400.0, "grad_norm": 0.8148447127203781, "language_loss": 0.59968007, "learning_rate": 2.962580166414467e-06, "loss": 0.61982793, "num_input_tokens_seen": 121523795, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.10742188, "step": 5659, "time_per_iteration": 2.9893085956573486 }, { "auxiliary_loss_clip": 0.01074024, "auxiliary_loss_mlp": 0.01025055, "balance_loss_clip": 1.01110721, "balance_loss_mlp": 1.025442, "epoch": 0.34029761010070647, "flos": 24059450989440.0, "grad_norm": 1.7796851025341553, "language_loss": 0.6785512, "learning_rate": 2.9622489647789742e-06, "loss": 0.69954199, "num_input_tokens_seen": 121542950, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.484375, "step": 5660, "time_per_iteration": 2.403658866882324 }, { "auxiliary_loss_clip": 0.01078764, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.01771879, "balance_loss_mlp": 1.02695012, "epoch": 0.34035773335337444, "flos": 27598491843840.0, "grad_norm": 1.78381560535721, "language_loss": 0.6722554, "learning_rate": 2.9619177288032904e-06, "loss": 0.69337779, "num_input_tokens_seen": 121562765, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.51953125, "step": 5661, "time_per_iteration": 2.4223902225494385 }, { "auxiliary_loss_clip": 0.0107266, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.01619661, "balance_loss_mlp": 1.02369678, "epoch": 0.3404178566060424, "flos": 20811760364160.0, "grad_norm": 1.8030479031776563, "language_loss": 0.7913093, "learning_rate": 2.9615864584992374e-06, "loss": 0.81234097, "num_input_tokens_seen": 121581610, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.49023438, "step": 5662, "time_per_iteration": 2.37856388092041 }, { "auxiliary_loss_clip": 0.01075628, "auxiliary_loss_mlp": 0.01034125, "balance_loss_clip": 1.01933074, "balance_loss_mlp": 1.02419043, "epoch": 0.34047797985871037, "flos": 26832357768960.0, "grad_norm": 2.475650058666183, "language_loss": 0.73567003, "learning_rate": 2.961255153878637e-06, "loss": 0.75676757, "num_input_tokens_seen": 121601885, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.515625, "step": 5663, "time_per_iteration": 2.416792154312134 }, { "auxiliary_loss_clip": 0.01070499, "auxiliary_loss_mlp": 0.01027902, "balance_loss_clip": 1.01439524, "balance_loss_mlp": 1.02212024, "epoch": 0.34053810311137833, "flos": 19681621787520.0, "grad_norm": 1.612172269073878, "language_loss": 0.85902905, "learning_rate": 2.9609238149533132e-06, "loss": 0.88001305, "num_input_tokens_seen": 121621335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.484375, "step": 5664, "time_per_iteration": 2.3576905727386475 }, { "auxiliary_loss_clip": 0.01074406, "auxiliary_loss_mlp": 0.01026329, "balance_loss_clip": 1.01238728, "balance_loss_mlp": 1.02412224, "epoch": 0.3405982263640463, "flos": 21724669261440.0, "grad_norm": 2.206593324272444, "language_loss": 0.69139558, "learning_rate": 2.9605924417350904e-06, "loss": 0.71240294, "num_input_tokens_seen": 121641310, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.5, "step": 5665, "time_per_iteration": 2.3931596279144287 }, { "auxiliary_loss_clip": 0.01074031, "auxiliary_loss_mlp": 0.01028627, "balance_loss_clip": 1.01261675, "balance_loss_mlp": 1.0229063, "epoch": 0.34065834961671426, "flos": 18040634064000.0, "grad_norm": 2.777823712143208, "language_loss": 0.72417438, "learning_rate": 2.960261034235794e-06, "loss": 0.74520099, "num_input_tokens_seen": 121659625, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.51171875, "step": 5666, "time_per_iteration": 3.715991497039795 }, { "auxiliary_loss_clip": 0.01075164, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.01895595, "balance_loss_mlp": 1.02328968, "epoch": 0.3407184728693822, "flos": 21396276213120.0, "grad_norm": 1.5275353853193336, "language_loss": 0.73051095, "learning_rate": 2.959929592467251e-06, "loss": 0.75159669, "num_input_tokens_seen": 121679205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.51953125, "step": 5667, "time_per_iteration": 2.3947646617889404 }, { "auxiliary_loss_clip": 0.01012561, "auxiliary_loss_mlp": 0.0100436, "balance_loss_clip": 1.00307214, "balance_loss_mlp": 1.00175703, "epoch": 0.3407785961220502, "flos": 68684559102720.0, "grad_norm": 0.8794491710731555, "language_loss": 0.63289535, "learning_rate": 2.959598116441291e-06, "loss": 0.65306461, "num_input_tokens_seen": 121751085, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.10791016, "step": 5668, "time_per_iteration": 3.17501163482666 }, { "auxiliary_loss_clip": 0.01075604, "auxiliary_loss_mlp": 0.01033663, "balance_loss_clip": 1.01920295, "balance_loss_mlp": 1.02472508, "epoch": 0.34083871937471816, "flos": 14063503069440.0, "grad_norm": 2.23530494272223, "language_loss": 0.72277373, "learning_rate": 2.959266606169741e-06, "loss": 0.74386638, "num_input_tokens_seen": 121768565, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.51171875, "step": 5669, "time_per_iteration": 5.303999423980713 }, { "auxiliary_loss_clip": 0.01077472, "auxiliary_loss_mlp": 0.01032144, "balance_loss_clip": 1.01693916, "balance_loss_mlp": 1.02426696, "epoch": 0.3408988426273861, "flos": 17084677593600.0, "grad_norm": 1.8950884689325542, "language_loss": 0.8054558, "learning_rate": 2.9589350616644353e-06, "loss": 0.82655203, "num_input_tokens_seen": 121784925, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.53125, "step": 5670, "time_per_iteration": 2.35649037361145 }, { "auxiliary_loss_clip": 0.01075323, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.01763189, "balance_loss_mlp": 1.02297938, "epoch": 0.3409589658800541, "flos": 24023420599680.0, "grad_norm": 1.6016969120455145, "language_loss": 0.77061087, "learning_rate": 2.9586034829372026e-06, "loss": 0.79168856, "num_input_tokens_seen": 121804425, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5234375, "step": 5671, "time_per_iteration": 2.3982691764831543 }, { "auxiliary_loss_clip": 0.0107555, "auxiliary_loss_mlp": 0.01034369, "balance_loss_clip": 1.01825142, "balance_loss_mlp": 1.02337396, "epoch": 0.34101908913272205, "flos": 21140956373760.0, "grad_norm": 1.796762478931232, "language_loss": 0.74508572, "learning_rate": 2.958271869999878e-06, "loss": 0.76618481, "num_input_tokens_seen": 121825145, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5234375, "step": 5672, "time_per_iteration": 2.4157934188842773 }, { "auxiliary_loss_clip": 0.01075326, "auxiliary_loss_mlp": 0.01029518, "balance_loss_clip": 1.01557052, "balance_loss_mlp": 1.02421045, "epoch": 0.3410792123853901, "flos": 15701209125120.0, "grad_norm": 3.570387713207652, "language_loss": 0.73293406, "learning_rate": 2.9579402228642956e-06, "loss": 0.75398248, "num_input_tokens_seen": 121842185, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.5078125, "step": 5673, "time_per_iteration": 2.3417811393737793 }, { "auxiliary_loss_clip": 0.01074595, "auxiliary_loss_mlp": 0.0102692, "balance_loss_clip": 1.01315713, "balance_loss_mlp": 1.02400386, "epoch": 0.34113933563805804, "flos": 23034994698240.0, "grad_norm": 1.963562023842391, "language_loss": 0.79760337, "learning_rate": 2.9576085415422902e-06, "loss": 0.81861854, "num_input_tokens_seen": 121862260, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.5078125, "step": 5674, "time_per_iteration": 3.866154193878174 }, { "auxiliary_loss_clip": 0.01013711, "auxiliary_loss_mlp": 0.01001131, "balance_loss_clip": 0.99987978, "balance_loss_mlp": 1.00293326, "epoch": 0.341199458890726, "flos": 69611294764800.0, "grad_norm": 0.8143198738572112, "language_loss": 0.56072396, "learning_rate": 2.957276826045699e-06, "loss": 0.58087242, "num_input_tokens_seen": 121923560, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.10791016, "step": 5675, "time_per_iteration": 3.181320905685425 }, { "auxiliary_loss_clip": 0.0107496, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.01872945, "balance_loss_mlp": 1.02460063, "epoch": 0.34125958214339397, "flos": 22345250411520.0, "grad_norm": 1.5788252027311565, "language_loss": 0.78773433, "learning_rate": 2.9569450763863606e-06, "loss": 0.8088128, "num_input_tokens_seen": 121943515, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.50390625, "step": 5676, "time_per_iteration": 2.386732578277588 }, { "auxiliary_loss_clip": 0.01072465, "auxiliary_loss_mlp": 0.01025233, "balance_loss_clip": 1.00962305, "balance_loss_mlp": 1.02226472, "epoch": 0.34131970539606193, "flos": 21870850590720.0, "grad_norm": 1.7749481020339508, "language_loss": 0.85304117, "learning_rate": 2.9566132925761143e-06, "loss": 0.87401807, "num_input_tokens_seen": 121962540, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5, "step": 5677, "time_per_iteration": 2.4253628253936768 }, { "auxiliary_loss_clip": 0.01072604, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.01908624, "balance_loss_mlp": 1.02298474, "epoch": 0.3413798286487299, "flos": 24934583928960.0, "grad_norm": 1.8824480524619214, "language_loss": 0.79324758, "learning_rate": 2.9562814746267996e-06, "loss": 0.81430763, "num_input_tokens_seen": 121979830, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49609375, "step": 5678, "time_per_iteration": 2.4013874530792236 }, { "auxiliary_loss_clip": 0.01074352, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.01456094, "balance_loss_mlp": 1.02263176, "epoch": 0.34143995190139786, "flos": 25373197739520.0, "grad_norm": 1.742050798159555, "language_loss": 0.74857879, "learning_rate": 2.9559496225502594e-06, "loss": 0.76960969, "num_input_tokens_seen": 121999055, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.515625, "step": 5679, "time_per_iteration": 2.414229154586792 }, { "auxiliary_loss_clip": 0.010139, "auxiliary_loss_mlp": 0.01010534, "balance_loss_clip": 1.00940156, "balance_loss_mlp": 1.00296545, "epoch": 0.34150007515406583, "flos": 67778876856960.0, "grad_norm": 0.7177870234899756, "language_loss": 0.59463364, "learning_rate": 2.955617736358336e-06, "loss": 0.61487794, "num_input_tokens_seen": 122067015, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.109375, "step": 5680, "time_per_iteration": 3.1156423091888428 }, { "auxiliary_loss_clip": 0.01073377, "auxiliary_loss_mlp": 0.01025167, "balance_loss_clip": 1.01211929, "balance_loss_mlp": 1.02435422, "epoch": 0.3415601984067338, "flos": 20301399976320.0, "grad_norm": 1.8957808482037422, "language_loss": 0.7218442, "learning_rate": 2.955285816062874e-06, "loss": 0.74282968, "num_input_tokens_seen": 122085295, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.49023438, "step": 5681, "time_per_iteration": 2.37471342086792 }, { "auxiliary_loss_clip": 0.01071708, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.01884079, "balance_loss_mlp": 1.02241039, "epoch": 0.34162032165940176, "flos": 26029983836160.0, "grad_norm": 2.011187576118092, "language_loss": 0.71353495, "learning_rate": 2.9549538616757183e-06, "loss": 0.73457569, "num_input_tokens_seen": 122104020, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.4921875, "step": 5682, "time_per_iteration": 2.421415328979492 }, { "auxiliary_loss_clip": 0.0107595, "auxiliary_loss_mlp": 0.01032842, "balance_loss_clip": 1.016886, "balance_loss_mlp": 1.02399075, "epoch": 0.3416804449120697, "flos": 28802087654400.0, "grad_norm": 1.6609579092068152, "language_loss": 0.84060884, "learning_rate": 2.9546218732087154e-06, "loss": 0.86169678, "num_input_tokens_seen": 122125080, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.51953125, "step": 5683, "time_per_iteration": 2.4393839836120605 }, { "auxiliary_loss_clip": 0.01078074, "auxiliary_loss_mlp": 0.01034073, "balance_loss_clip": 1.01802111, "balance_loss_mlp": 1.02507877, "epoch": 0.3417405681647377, "flos": 22600500428160.0, "grad_norm": 2.5783638337989148, "language_loss": 0.70450306, "learning_rate": 2.9542898506737135e-06, "loss": 0.72562456, "num_input_tokens_seen": 122146350, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53125, "step": 5684, "time_per_iteration": 2.4221436977386475 }, { "auxiliary_loss_clip": 0.0107283, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.01708674, "balance_loss_mlp": 1.02378631, "epoch": 0.34180069141740566, "flos": 24715119922560.0, "grad_norm": 1.3735198308248453, "language_loss": 0.74791551, "learning_rate": 2.953957794082562e-06, "loss": 0.76894587, "num_input_tokens_seen": 122168085, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.49023438, "step": 5685, "time_per_iteration": 2.4222681522369385 }, { "auxiliary_loss_clip": 0.01073828, "auxiliary_loss_mlp": 0.01030782, "balance_loss_clip": 1.01708531, "balance_loss_mlp": 1.02382946, "epoch": 0.3418608146700737, "flos": 30517440307200.0, "grad_norm": 1.8220409488685603, "language_loss": 0.70073551, "learning_rate": 2.9536257034471107e-06, "loss": 0.72178161, "num_input_tokens_seen": 122191040, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.5, "step": 5686, "time_per_iteration": 2.4573311805725098 }, { "auxiliary_loss_clip": 0.010748, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.016294, "balance_loss_mlp": 1.02225232, "epoch": 0.34192093792274164, "flos": 15121441221120.0, "grad_norm": 2.1157254569466764, "language_loss": 0.77686048, "learning_rate": 2.9532935787792114e-06, "loss": 0.79792655, "num_input_tokens_seen": 122209225, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5234375, "step": 5687, "time_per_iteration": 2.367379903793335 }, { "auxiliary_loss_clip": 0.01075551, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.01678872, "balance_loss_mlp": 1.02577603, "epoch": 0.3419810611754096, "flos": 13186973675520.0, "grad_norm": 2.5256990769760366, "language_loss": 0.8633939, "learning_rate": 2.9529614200907157e-06, "loss": 0.88446498, "num_input_tokens_seen": 122226160, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.49804688, "step": 5688, "time_per_iteration": 2.3486032485961914 }, { "auxiliary_loss_clip": 0.01080261, "auxiliary_loss_mlp": 0.01031542, "balance_loss_clip": 1.01519239, "balance_loss_mlp": 1.02515614, "epoch": 0.34204118442807757, "flos": 19535265901440.0, "grad_norm": 6.925714096368138, "language_loss": 0.79477704, "learning_rate": 2.9526292273934787e-06, "loss": 0.81589508, "num_input_tokens_seen": 122243115, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.55078125, "step": 5689, "time_per_iteration": 2.367412805557251 }, { "auxiliary_loss_clip": 0.01074823, "auxiliary_loss_mlp": 0.01031888, "balance_loss_clip": 1.01592612, "balance_loss_mlp": 1.02324164, "epoch": 0.34210130768074554, "flos": 15193955848320.0, "grad_norm": 2.095379269535975, "language_loss": 0.73551762, "learning_rate": 2.9522970006993547e-06, "loss": 0.7565847, "num_input_tokens_seen": 122261105, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.515625, "step": 5690, "time_per_iteration": 2.3497183322906494 }, { "auxiliary_loss_clip": 0.01074195, "auxiliary_loss_mlp": 0.01026784, "balance_loss_clip": 1.01299095, "balance_loss_mlp": 1.02267826, "epoch": 0.3421614309334135, "flos": 24935072688000.0, "grad_norm": 2.5310555944899122, "language_loss": 0.75868571, "learning_rate": 2.9519647400202003e-06, "loss": 0.77969539, "num_input_tokens_seen": 122279995, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.515625, "step": 5691, "time_per_iteration": 2.4062180519104004 }, { "auxiliary_loss_clip": 0.01072996, "auxiliary_loss_mlp": 0.01028443, "balance_loss_clip": 1.01408434, "balance_loss_mlp": 1.02382326, "epoch": 0.34222155418608147, "flos": 21907544296320.0, "grad_norm": 2.8975207884020935, "language_loss": 0.68100238, "learning_rate": 2.9516324453678733e-06, "loss": 0.70201677, "num_input_tokens_seen": 122299070, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.4921875, "step": 5692, "time_per_iteration": 2.4295010566711426 }, { "auxiliary_loss_clip": 0.01077801, "auxiliary_loss_mlp": 0.01032918, "balance_loss_clip": 1.01755166, "balance_loss_mlp": 1.02483153, "epoch": 0.34228167743874943, "flos": 18113078868480.0, "grad_norm": 3.498462332068761, "language_loss": 0.71870959, "learning_rate": 2.9513001167542316e-06, "loss": 0.73981678, "num_input_tokens_seen": 122316800, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53125, "step": 5693, "time_per_iteration": 2.3698456287384033 }, { "auxiliary_loss_clip": 0.01074298, "auxiliary_loss_mlp": 0.01036127, "balance_loss_clip": 1.02198911, "balance_loss_mlp": 1.02347028, "epoch": 0.3423418006914174, "flos": 21287521728000.0, "grad_norm": 1.810947149540552, "language_loss": 0.75413698, "learning_rate": 2.9509677541911363e-06, "loss": 0.7752412, "num_input_tokens_seen": 122335275, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.5078125, "step": 5694, "time_per_iteration": 2.383002996444702 }, { "auxiliary_loss_clip": 0.01074298, "auxiliary_loss_mlp": 0.01026843, "balance_loss_clip": 1.01333082, "balance_loss_mlp": 1.02473545, "epoch": 0.34240192394408536, "flos": 19822601324160.0, "grad_norm": 1.7129228357165343, "language_loss": 0.79348469, "learning_rate": 2.9506353576904483e-06, "loss": 0.81449616, "num_input_tokens_seen": 122353215, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.49609375, "step": 5695, "time_per_iteration": 2.3853683471679688 }, { "auxiliary_loss_clip": 0.01074382, "auxiliary_loss_mlp": 0.01032469, "balance_loss_clip": 1.01774085, "balance_loss_mlp": 1.02322435, "epoch": 0.3424620471967533, "flos": 24534374480640.0, "grad_norm": 1.8839048115373787, "language_loss": 0.73172057, "learning_rate": 2.9503029272640296e-06, "loss": 0.75278914, "num_input_tokens_seen": 122372495, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.51171875, "step": 5696, "time_per_iteration": 2.4111664295196533 }, { "auxiliary_loss_clip": 0.0107503, "auxiliary_loss_mlp": 0.01037978, "balance_loss_clip": 1.02321994, "balance_loss_mlp": 1.02284837, "epoch": 0.3425221704494213, "flos": 25847702294400.0, "grad_norm": 1.7880738321196663, "language_loss": 0.7082808, "learning_rate": 2.9499704629237436e-06, "loss": 0.72941089, "num_input_tokens_seen": 122394600, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5234375, "step": 5697, "time_per_iteration": 2.456502914428711 }, { "auxiliary_loss_clip": 0.01072197, "auxiliary_loss_mlp": 0.01030383, "balance_loss_clip": 1.01631629, "balance_loss_mlp": 1.02377963, "epoch": 0.34258229370208926, "flos": 21539524988160.0, "grad_norm": 1.919144394090135, "language_loss": 0.81856322, "learning_rate": 2.9496379646814555e-06, "loss": 0.839589, "num_input_tokens_seen": 122414700, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.484375, "step": 5698, "time_per_iteration": 2.4169797897338867 }, { "auxiliary_loss_clip": 0.01074333, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.02098382, "balance_loss_mlp": 1.02245975, "epoch": 0.3426424169547573, "flos": 23651840332800.0, "grad_norm": 2.518058786375559, "language_loss": 0.68970788, "learning_rate": 2.949305432549031e-06, "loss": 0.7108165, "num_input_tokens_seen": 122432760, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51953125, "step": 5699, "time_per_iteration": 2.3943941593170166 }, { "auxiliary_loss_clip": 0.01075163, "auxiliary_loss_mlp": 0.01028546, "balance_loss_clip": 1.01354909, "balance_loss_mlp": 1.0239923, "epoch": 0.34270254020742524, "flos": 24643722458880.0, "grad_norm": 2.2104811468110777, "language_loss": 0.72249305, "learning_rate": 2.9489728665383382e-06, "loss": 0.74353015, "num_input_tokens_seen": 122449105, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.51171875, "step": 5700, "time_per_iteration": 2.397564172744751 }, { "auxiliary_loss_clip": 0.01073098, "auxiliary_loss_mlp": 0.01029279, "balance_loss_clip": 1.01558805, "balance_loss_mlp": 1.02317071, "epoch": 0.3427626634600932, "flos": 20995682739840.0, "grad_norm": 2.6289647790545967, "language_loss": 0.8181535, "learning_rate": 2.948640266661244e-06, "loss": 0.83917725, "num_input_tokens_seen": 122468700, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.5, "step": 5701, "time_per_iteration": 2.3824145793914795 }, { "auxiliary_loss_clip": 0.0107711, "auxiliary_loss_mlp": 0.01034621, "balance_loss_clip": 1.02019644, "balance_loss_mlp": 1.02536964, "epoch": 0.3428227867127612, "flos": 21432725539200.0, "grad_norm": 2.0152050073294117, "language_loss": 0.71497083, "learning_rate": 2.94830763292962e-06, "loss": 0.7360881, "num_input_tokens_seen": 122488160, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.515625, "step": 5702, "time_per_iteration": 2.3891875743865967 }, { "auxiliary_loss_clip": 0.01013416, "auxiliary_loss_mlp": 0.0100492, "balance_loss_clip": 1.0035913, "balance_loss_mlp": 1.00280738, "epoch": 0.34288290996542914, "flos": 55728709827840.0, "grad_norm": 0.7810744800621326, "language_loss": 0.5740124, "learning_rate": 2.9479749653553347e-06, "loss": 0.59419584, "num_input_tokens_seen": 122542890, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.10644531, "step": 5703, "time_per_iteration": 2.9056954383850098 }, { "auxiliary_loss_clip": 0.01077928, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.02021539, "balance_loss_mlp": 1.02564716, "epoch": 0.3429430332180971, "flos": 20155777228800.0, "grad_norm": 1.8204221027755783, "language_loss": 0.75020349, "learning_rate": 2.947642263950262e-06, "loss": 0.77134013, "num_input_tokens_seen": 122561770, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5234375, "step": 5704, "time_per_iteration": 2.372490406036377 }, { "auxiliary_loss_clip": 0.01074965, "auxiliary_loss_mlp": 0.01029224, "balance_loss_clip": 1.01540124, "balance_loss_mlp": 1.0250175, "epoch": 0.34300315647076507, "flos": 17964942503040.0, "grad_norm": 1.879754833532729, "language_loss": 0.72667003, "learning_rate": 2.947309528726274e-06, "loss": 0.7477119, "num_input_tokens_seen": 122580580, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.5, "step": 5705, "time_per_iteration": 3.7764549255371094 }, { "auxiliary_loss_clip": 0.01073615, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.01195419, "balance_loss_mlp": 1.02365065, "epoch": 0.34306327972343303, "flos": 22085845943040.0, "grad_norm": 6.5751739236510875, "language_loss": 0.80007935, "learning_rate": 2.9469767596952463e-06, "loss": 0.82107627, "num_input_tokens_seen": 122599810, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.5, "step": 5706, "time_per_iteration": 2.3817293643951416 }, { "auxiliary_loss_clip": 0.01077495, "auxiliary_loss_mlp": 0.01025772, "balance_loss_clip": 1.01019716, "balance_loss_mlp": 1.02438688, "epoch": 0.343123402976101, "flos": 18441681384960.0, "grad_norm": 2.6645609762034494, "language_loss": 0.82980669, "learning_rate": 2.946643956869054e-06, "loss": 0.85083938, "num_input_tokens_seen": 122616035, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.53125, "step": 5707, "time_per_iteration": 2.365241289138794 }, { "auxiliary_loss_clip": 0.0107762, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.01338148, "balance_loss_mlp": 1.0263772, "epoch": 0.34318352622876896, "flos": 17162778038400.0, "grad_norm": 2.960922892708378, "language_loss": 0.75558245, "learning_rate": 2.9463111202595734e-06, "loss": 0.77664202, "num_input_tokens_seen": 122633785, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.51171875, "step": 5708, "time_per_iteration": 2.36161208152771 }, { "auxiliary_loss_clip": 0.01073583, "auxiliary_loss_mlp": 0.01027603, "balance_loss_clip": 1.01321423, "balance_loss_mlp": 1.02404428, "epoch": 0.34324364948143693, "flos": 26686944489600.0, "grad_norm": 1.7596122305417234, "language_loss": 0.81427091, "learning_rate": 2.945978249878683e-06, "loss": 0.8352828, "num_input_tokens_seen": 122652100, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49609375, "step": 5709, "time_per_iteration": 3.832428216934204 }, { "auxiliary_loss_clip": 0.01076236, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.02000666, "balance_loss_mlp": 1.02516234, "epoch": 0.3433037727341049, "flos": 21250513820160.0, "grad_norm": 3.63411230336256, "language_loss": 0.78820145, "learning_rate": 2.9456453457382628e-06, "loss": 0.80933654, "num_input_tokens_seen": 122669720, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.51171875, "step": 5710, "time_per_iteration": 2.3868179321289062 }, { "auxiliary_loss_clip": 0.01077579, "auxiliary_loss_mlp": 0.01034373, "balance_loss_clip": 1.017946, "balance_loss_mlp": 1.02373505, "epoch": 0.34336389598677286, "flos": 20628431481600.0, "grad_norm": 1.6910221146640076, "language_loss": 0.69980818, "learning_rate": 2.9453124078501926e-06, "loss": 0.72092772, "num_input_tokens_seen": 122688715, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5390625, "step": 5711, "time_per_iteration": 2.376746892929077 }, { "auxiliary_loss_clip": 0.01074128, "auxiliary_loss_mlp": 0.01029468, "balance_loss_clip": 1.01438832, "balance_loss_mlp": 1.02360702, "epoch": 0.3434240192394409, "flos": 14537693422080.0, "grad_norm": 1.952955427006162, "language_loss": 0.67864966, "learning_rate": 2.944979436226354e-06, "loss": 0.69968557, "num_input_tokens_seen": 122706970, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.50390625, "step": 5712, "time_per_iteration": 2.3456666469573975 }, { "auxiliary_loss_clip": 0.01012491, "auxiliary_loss_mlp": 0.01004574, "balance_loss_clip": 1.00324452, "balance_loss_mlp": 1.00178766, "epoch": 0.34348414249210885, "flos": 58048828784640.0, "grad_norm": 1.3914194816331944, "language_loss": 0.58103466, "learning_rate": 2.94464643087863e-06, "loss": 0.60120523, "num_input_tokens_seen": 122758095, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.10742188, "step": 5713, "time_per_iteration": 2.9882845878601074 }, { "auxiliary_loss_clip": 0.01073632, "auxiliary_loss_mlp": 0.010309, "balance_loss_clip": 1.01583195, "balance_loss_mlp": 1.02346694, "epoch": 0.3435442657447768, "flos": 20703389904000.0, "grad_norm": 1.8318763180632778, "language_loss": 0.805336, "learning_rate": 2.9443133918189054e-06, "loss": 0.82638127, "num_input_tokens_seen": 122777815, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5, "step": 5714, "time_per_iteration": 3.7624547481536865 }, { "auxiliary_loss_clip": 0.01075892, "auxiliary_loss_mlp": 0.01027368, "balance_loss_clip": 1.01227617, "balance_loss_mlp": 1.02396405, "epoch": 0.3436043889974448, "flos": 22929137856000.0, "grad_norm": 1.9340453784652882, "language_loss": 0.7198323, "learning_rate": 2.943980319059064e-06, "loss": 0.74086487, "num_input_tokens_seen": 122797555, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.51953125, "step": 5715, "time_per_iteration": 2.388002872467041 }, { "auxiliary_loss_clip": 0.01012159, "auxiliary_loss_mlp": 0.01004033, "balance_loss_clip": 1.00257826, "balance_loss_mlp": 1.00159621, "epoch": 0.34366451225011274, "flos": 58399914216960.0, "grad_norm": 0.961472637748732, "language_loss": 0.65866983, "learning_rate": 2.9436472126109943e-06, "loss": 0.6788317, "num_input_tokens_seen": 122863955, "router_z_loss_clip": 0.01452637, "router_z_loss_mlp": 0.10546875, "step": 5716, "time_per_iteration": 3.1248884201049805 }, { "auxiliary_loss_clip": 0.0107796, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.01971757, "balance_loss_mlp": 1.02654815, "epoch": 0.3437246355027807, "flos": 15595387194240.0, "grad_norm": 1.9599620798169002, "language_loss": 0.7386241, "learning_rate": 2.9433140724865824e-06, "loss": 0.75973964, "num_input_tokens_seen": 122883000, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.515625, "step": 5717, "time_per_iteration": 2.3608038425445557 }, { "auxiliary_loss_clip": 0.01074512, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.01446342, "balance_loss_mlp": 1.02328563, "epoch": 0.34378475875544867, "flos": 27671041382400.0, "grad_norm": 1.7242431315742077, "language_loss": 0.75316566, "learning_rate": 2.9429808986977175e-06, "loss": 0.77419877, "num_input_tokens_seen": 122903265, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.51171875, "step": 5718, "time_per_iteration": 2.4267332553863525 }, { "auxiliary_loss_clip": 0.01074498, "auxiliary_loss_mlp": 0.01037326, "balance_loss_clip": 1.02221656, "balance_loss_mlp": 1.02298272, "epoch": 0.34384488200811664, "flos": 31430139736320.0, "grad_norm": 2.09727195742519, "language_loss": 0.63215935, "learning_rate": 2.9426476912562905e-06, "loss": 0.65327752, "num_input_tokens_seen": 122923860, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.515625, "step": 5719, "time_per_iteration": 2.4573211669921875 }, { "auxiliary_loss_clip": 0.01076853, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.01659739, "balance_loss_mlp": 1.02301538, "epoch": 0.3439050052607846, "flos": 24898763007360.0, "grad_norm": 2.7939078417818957, "language_loss": 0.73268723, "learning_rate": 2.9423144501741918e-06, "loss": 0.75379127, "num_input_tokens_seen": 122945305, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5390625, "step": 5720, "time_per_iteration": 2.404113531112671 }, { "auxiliary_loss_clip": 0.0107499, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.01810896, "balance_loss_mlp": 1.02303505, "epoch": 0.34396512851345257, "flos": 18149109258240.0, "grad_norm": 3.081595879074298, "language_loss": 0.73838741, "learning_rate": 2.9419811754633143e-06, "loss": 0.7594707, "num_input_tokens_seen": 122962535, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.51953125, "step": 5721, "time_per_iteration": 2.3456480503082275 }, { "auxiliary_loss_clip": 0.01077279, "auxiliary_loss_mlp": 0.01037928, "balance_loss_clip": 1.02213287, "balance_loss_mlp": 1.02426636, "epoch": 0.34402525176612053, "flos": 16033512245760.0, "grad_norm": 2.3607524072250174, "language_loss": 0.80540323, "learning_rate": 2.9416478671355516e-06, "loss": 0.82655531, "num_input_tokens_seen": 122979750, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53125, "step": 5722, "time_per_iteration": 2.3664419651031494 }, { "auxiliary_loss_clip": 0.01075159, "auxiliary_loss_mlp": 0.0102811, "balance_loss_clip": 1.01406729, "balance_loss_mlp": 1.0242641, "epoch": 0.3440853750187885, "flos": 21177580256640.0, "grad_norm": 1.6469063943501463, "language_loss": 0.81590909, "learning_rate": 2.9413145252027985e-06, "loss": 0.83694184, "num_input_tokens_seen": 122998955, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.5078125, "step": 5723, "time_per_iteration": 2.3793845176696777 }, { "auxiliary_loss_clip": 0.01075197, "auxiliary_loss_mlp": 0.01032864, "balance_loss_clip": 1.01790333, "balance_loss_mlp": 1.02287889, "epoch": 0.34414549827145646, "flos": 12677032224000.0, "grad_norm": 2.004536861061174, "language_loss": 0.81350088, "learning_rate": 2.940981149676952e-06, "loss": 0.83458149, "num_input_tokens_seen": 123016165, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5234375, "step": 5724, "time_per_iteration": 2.359459400177002 }, { "auxiliary_loss_clip": 0.01076813, "auxiliary_loss_mlp": 0.01035081, "balance_loss_clip": 1.01961946, "balance_loss_mlp": 1.02426028, "epoch": 0.3442056215241244, "flos": 31283190357120.0, "grad_norm": 1.7974641488688818, "language_loss": 0.69345838, "learning_rate": 2.940647740569908e-06, "loss": 0.71457732, "num_input_tokens_seen": 123036900, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.52734375, "step": 5725, "time_per_iteration": 2.451002359390259 }, { "auxiliary_loss_clip": 0.01079179, "auxiliary_loss_mlp": 0.01039052, "balance_loss_clip": 1.02187443, "balance_loss_mlp": 1.02365041, "epoch": 0.34426574477679245, "flos": 23366180655360.0, "grad_norm": 1.3771175473156736, "language_loss": 0.69205964, "learning_rate": 2.9403142978935665e-06, "loss": 0.71324199, "num_input_tokens_seen": 123057480, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5546875, "step": 5726, "time_per_iteration": 2.3979878425598145 }, { "auxiliary_loss_clip": 0.01074969, "auxiliary_loss_mlp": 0.01031835, "balance_loss_clip": 1.01743448, "balance_loss_mlp": 1.02418554, "epoch": 0.3443258680294604, "flos": 24534269746560.0, "grad_norm": 1.8254818200113507, "language_loss": 0.72980255, "learning_rate": 2.939980821659826e-06, "loss": 0.75087065, "num_input_tokens_seen": 123076890, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5078125, "step": 5727, "time_per_iteration": 2.400801658630371 }, { "auxiliary_loss_clip": 0.01074399, "auxiliary_loss_mlp": 0.01032277, "balance_loss_clip": 1.01679194, "balance_loss_mlp": 1.02309775, "epoch": 0.3443859912821284, "flos": 20229094817280.0, "grad_norm": 2.052924977815989, "language_loss": 0.8778612, "learning_rate": 2.9396473118805886e-06, "loss": 0.89892799, "num_input_tokens_seen": 123092530, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.51171875, "step": 5728, "time_per_iteration": 2.3673133850097656 }, { "auxiliary_loss_clip": 0.01074814, "auxiliary_loss_mlp": 0.01028116, "balance_loss_clip": 1.01354897, "balance_loss_mlp": 1.02365649, "epoch": 0.34444611453479634, "flos": 24315364321920.0, "grad_norm": 2.36386125166066, "language_loss": 0.70071566, "learning_rate": 2.9393137685677555e-06, "loss": 0.72174501, "num_input_tokens_seen": 123110560, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.51171875, "step": 5729, "time_per_iteration": 2.3869028091430664 }, { "auxiliary_loss_clip": 0.01072309, "auxiliary_loss_mlp": 0.01026316, "balance_loss_clip": 1.01136732, "balance_loss_mlp": 1.02240229, "epoch": 0.3445062377874643, "flos": 16982451532800.0, "grad_norm": 1.9786301324215365, "language_loss": 0.74284554, "learning_rate": 2.9389801917332294e-06, "loss": 0.76383179, "num_input_tokens_seen": 123128655, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5, "step": 5730, "time_per_iteration": 2.359952688217163 }, { "auxiliary_loss_clip": 0.01075309, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.01738501, "balance_loss_mlp": 1.02374601, "epoch": 0.3445663610401323, "flos": 20301679267200.0, "grad_norm": 2.4256585009243996, "language_loss": 0.79217422, "learning_rate": 2.938646581388917e-06, "loss": 0.81325412, "num_input_tokens_seen": 123145130, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.515625, "step": 5731, "time_per_iteration": 2.354198932647705 }, { "auxiliary_loss_clip": 0.01073332, "auxiliary_loss_mlp": 0.01030458, "balance_loss_clip": 1.01530051, "balance_loss_mlp": 1.02286911, "epoch": 0.34462648429280024, "flos": 15887191271040.0, "grad_norm": 1.7999135472334495, "language_loss": 0.7846632, "learning_rate": 2.9383129375467214e-06, "loss": 0.80570114, "num_input_tokens_seen": 123162265, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.50390625, "step": 5732, "time_per_iteration": 2.3416993618011475 }, { "auxiliary_loss_clip": 0.01016597, "auxiliary_loss_mlp": 0.01018421, "balance_loss_clip": 1.01696038, "balance_loss_mlp": 1.00597429, "epoch": 0.3446866075454682, "flos": 59307760967040.0, "grad_norm": 0.7502479164887117, "language_loss": 0.53450692, "learning_rate": 2.937979260218551e-06, "loss": 0.55485713, "num_input_tokens_seen": 123218620, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.10644531, "step": 5733, "time_per_iteration": 3.051616668701172 }, { "auxiliary_loss_clip": 0.01077458, "auxiliary_loss_mlp": 0.0103427, "balance_loss_clip": 1.01828384, "balance_loss_mlp": 1.02557707, "epoch": 0.34474673079813617, "flos": 22342771704960.0, "grad_norm": 1.8022796957976845, "language_loss": 0.83247119, "learning_rate": 2.9376455494163137e-06, "loss": 0.8535884, "num_input_tokens_seen": 123237325, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.51953125, "step": 5734, "time_per_iteration": 2.3901491165161133 }, { "auxiliary_loss_clip": 0.01077648, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.01475275, "balance_loss_mlp": 1.02484977, "epoch": 0.34480685405080413, "flos": 27668981612160.0, "grad_norm": 1.8166628704497338, "language_loss": 0.92839354, "learning_rate": 2.9373118051519185e-06, "loss": 0.94946253, "num_input_tokens_seen": 123258650, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.52734375, "step": 5735, "time_per_iteration": 2.4364943504333496 }, { "auxiliary_loss_clip": 0.01078311, "auxiliary_loss_mlp": 0.01037926, "balance_loss_clip": 1.02165425, "balance_loss_mlp": 1.02551818, "epoch": 0.3448669773034721, "flos": 22454912592000.0, "grad_norm": 1.738102938193459, "language_loss": 0.76464218, "learning_rate": 2.936978027437276e-06, "loss": 0.78580451, "num_input_tokens_seen": 123277155, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.52734375, "step": 5736, "time_per_iteration": 2.4330787658691406 }, { "auxiliary_loss_clip": 0.01079526, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.01942849, "balance_loss_mlp": 1.02621675, "epoch": 0.34492710055614006, "flos": 24935037776640.0, "grad_norm": 1.5953825999955018, "language_loss": 0.7859174, "learning_rate": 2.9366442162842976e-06, "loss": 0.80706549, "num_input_tokens_seen": 123297640, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53515625, "step": 5737, "time_per_iteration": 2.4368796348571777 }, { "auxiliary_loss_clip": 0.01078198, "auxiliary_loss_mlp": 0.01035162, "balance_loss_clip": 1.01867485, "balance_loss_mlp": 1.02395606, "epoch": 0.34498722380880803, "flos": 20119781750400.0, "grad_norm": 2.3215257339896884, "language_loss": 0.72001284, "learning_rate": 2.936310371704897e-06, "loss": 0.74114645, "num_input_tokens_seen": 123314370, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.54296875, "step": 5738, "time_per_iteration": 2.37078595161438 }, { "auxiliary_loss_clip": 0.01077618, "auxiliary_loss_mlp": 0.01036955, "balance_loss_clip": 1.0200274, "balance_loss_mlp": 1.02273893, "epoch": 0.34504734706147605, "flos": 28436896166400.0, "grad_norm": 1.927403471930639, "language_loss": 0.81537116, "learning_rate": 2.9359764937109877e-06, "loss": 0.83651686, "num_input_tokens_seen": 123336085, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.546875, "step": 5739, "time_per_iteration": 2.4279112815856934 }, { "auxiliary_loss_clip": 0.01077766, "auxiliary_loss_mlp": 0.01030833, "balance_loss_clip": 1.01411963, "balance_loss_mlp": 1.02528167, "epoch": 0.345107470314144, "flos": 22673364168960.0, "grad_norm": 1.8140352206923913, "language_loss": 0.82550031, "learning_rate": 2.9356425823144847e-06, "loss": 0.84658629, "num_input_tokens_seen": 123354460, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5234375, "step": 5740, "time_per_iteration": 2.38163423538208 }, { "auxiliary_loss_clip": 0.01076628, "auxiliary_loss_mlp": 0.01036161, "balance_loss_clip": 1.02053297, "balance_loss_mlp": 1.02384496, "epoch": 0.345167593566812, "flos": 20629688290560.0, "grad_norm": 2.20053714675375, "language_loss": 0.76983535, "learning_rate": 2.9353086375273047e-06, "loss": 0.79096317, "num_input_tokens_seen": 123373420, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.52734375, "step": 5741, "time_per_iteration": 2.39556622505188 }, { "auxiliary_loss_clip": 0.01077453, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.01949894, "balance_loss_mlp": 1.02289307, "epoch": 0.34522771681947995, "flos": 26213138161920.0, "grad_norm": 2.7080009040108686, "language_loss": 0.76901495, "learning_rate": 2.9349746593613654e-06, "loss": 0.79014397, "num_input_tokens_seen": 123394730, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.546875, "step": 5742, "time_per_iteration": 2.4205451011657715 }, { "auxiliary_loss_clip": 0.01076968, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.01612163, "balance_loss_mlp": 1.02513111, "epoch": 0.3452878400721479, "flos": 19061354839680.0, "grad_norm": 2.1362010721961755, "language_loss": 0.75643522, "learning_rate": 2.934640647828586e-06, "loss": 0.77751452, "num_input_tokens_seen": 123412895, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.51953125, "step": 5743, "time_per_iteration": 2.3760452270507812 }, { "auxiliary_loss_clip": 0.01076572, "auxiliary_loss_mlp": 0.0103158, "balance_loss_clip": 1.01658332, "balance_loss_mlp": 1.02589667, "epoch": 0.3453479633248159, "flos": 27928455903360.0, "grad_norm": 1.7423094726165422, "language_loss": 0.70363706, "learning_rate": 2.934306602940885e-06, "loss": 0.72471857, "num_input_tokens_seen": 123432320, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5078125, "step": 5744, "time_per_iteration": 3.9053919315338135 }, { "auxiliary_loss_clip": 0.01076273, "auxiliary_loss_mlp": 0.01036956, "balance_loss_clip": 1.02117252, "balance_loss_mlp": 1.02527165, "epoch": 0.34540808657748384, "flos": 19605197088000.0, "grad_norm": 1.730994897244149, "language_loss": 0.79508853, "learning_rate": 2.9339725247101855e-06, "loss": 0.81622088, "num_input_tokens_seen": 123450980, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5078125, "step": 5745, "time_per_iteration": 2.3645341396331787 }, { "auxiliary_loss_clip": 0.01078754, "auxiliary_loss_mlp": 0.01037973, "balance_loss_clip": 1.02135563, "balance_loss_mlp": 1.02447414, "epoch": 0.3454682098301518, "flos": 20410643220480.0, "grad_norm": 2.0932756279079814, "language_loss": 0.89304304, "learning_rate": 2.933638413148409e-06, "loss": 0.91421044, "num_input_tokens_seen": 123469365, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.54296875, "step": 5746, "time_per_iteration": 2.365699052810669 }, { "auxiliary_loss_clip": 0.0107653, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.0184691, "balance_loss_mlp": 1.02264893, "epoch": 0.34552833308281977, "flos": 21324040876800.0, "grad_norm": 1.9816292143558374, "language_loss": 0.63900352, "learning_rate": 2.9333042682674788e-06, "loss": 0.66011238, "num_input_tokens_seen": 123489425, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5390625, "step": 5747, "time_per_iteration": 2.3807475566864014 }, { "auxiliary_loss_clip": 0.01076452, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.01575661, "balance_loss_mlp": 1.02542019, "epoch": 0.34558845633548774, "flos": 36242253740160.0, "grad_norm": 3.251906901895553, "language_loss": 0.72763515, "learning_rate": 2.9329700900793207e-06, "loss": 0.74870741, "num_input_tokens_seen": 123509970, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 5748, "time_per_iteration": 5.37058424949646 }, { "auxiliary_loss_clip": 0.0107386, "auxiliary_loss_mlp": 0.01025661, "balance_loss_clip": 1.0116837, "balance_loss_mlp": 1.02340102, "epoch": 0.3456485795881557, "flos": 22449606065280.0, "grad_norm": 1.574322390276474, "language_loss": 0.75496805, "learning_rate": 2.9326358785958593e-06, "loss": 0.77596331, "num_input_tokens_seen": 123531055, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.50390625, "step": 5749, "time_per_iteration": 2.428903818130493 }, { "auxiliary_loss_clip": 0.0101205, "auxiliary_loss_mlp": 0.01000613, "balance_loss_clip": 0.99915814, "balance_loss_mlp": 1.00142384, "epoch": 0.34570870284082367, "flos": 62001135936000.0, "grad_norm": 0.8768927374316948, "language_loss": 0.62612498, "learning_rate": 2.9323016338290227e-06, "loss": 0.64625168, "num_input_tokens_seen": 123584720, "router_z_loss_clip": 0.01452637, "router_z_loss_mlp": 0.10644531, "step": 5750, "time_per_iteration": 2.904465675354004 }, { "auxiliary_loss_clip": 0.01070569, "auxiliary_loss_mlp": 0.01026406, "balance_loss_clip": 1.0119102, "balance_loss_mlp": 1.02240634, "epoch": 0.34576882609349163, "flos": 22781141136000.0, "grad_norm": 1.781390994107412, "language_loss": 0.80388081, "learning_rate": 2.931967355790739e-06, "loss": 0.82485062, "num_input_tokens_seen": 123604465, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48242188, "step": 5751, "time_per_iteration": 2.4359335899353027 }, { "auxiliary_loss_clip": 0.01074952, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.02043736, "balance_loss_mlp": 1.02548003, "epoch": 0.34582894934615965, "flos": 12348010771200.0, "grad_norm": 2.0951007095316347, "language_loss": 0.83903956, "learning_rate": 2.931633044492937e-06, "loss": 0.86013579, "num_input_tokens_seen": 123622320, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.49609375, "step": 5752, "time_per_iteration": 2.346137285232544 }, { "auxiliary_loss_clip": 0.0101195, "auxiliary_loss_mlp": 0.01000668, "balance_loss_clip": 0.99930316, "balance_loss_mlp": 1.00140488, "epoch": 0.3458890725988276, "flos": 70164563080320.0, "grad_norm": 0.7362709478195214, "language_loss": 0.63234472, "learning_rate": 2.931298699947549e-06, "loss": 0.65247089, "num_input_tokens_seen": 123678010, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.10546875, "step": 5753, "time_per_iteration": 4.350102186203003 }, { "auxiliary_loss_clip": 0.01074815, "auxiliary_loss_mlp": 0.01036552, "balance_loss_clip": 1.02073264, "balance_loss_mlp": 1.02396595, "epoch": 0.3459491958514956, "flos": 17091624954240.0, "grad_norm": 1.9361911633317213, "language_loss": 0.71062148, "learning_rate": 2.9309643221665054e-06, "loss": 0.73173523, "num_input_tokens_seen": 123696830, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5078125, "step": 5754, "time_per_iteration": 2.382948875427246 }, { "auxiliary_loss_clip": 0.01076214, "auxiliary_loss_mlp": 0.01029767, "balance_loss_clip": 1.01380539, "balance_loss_mlp": 1.02380681, "epoch": 0.34600931910416355, "flos": 16650113500800.0, "grad_norm": 1.7727484939524747, "language_loss": 0.72692454, "learning_rate": 2.9306299111617402e-06, "loss": 0.74798429, "num_input_tokens_seen": 123714360, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5234375, "step": 5755, "time_per_iteration": 2.402243137359619 }, { "auxiliary_loss_clip": 0.01074005, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.01872611, "balance_loss_mlp": 1.0241015, "epoch": 0.3460694423568315, "flos": 38544635859840.0, "grad_norm": 1.5349196514409829, "language_loss": 0.72659063, "learning_rate": 2.9302954669451875e-06, "loss": 0.74766636, "num_input_tokens_seen": 123739250, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49804688, "step": 5756, "time_per_iteration": 2.5276730060577393 }, { "auxiliary_loss_clip": 0.01012249, "auxiliary_loss_mlp": 0.0100983, "balance_loss_clip": 1.00842357, "balance_loss_mlp": 1.00163567, "epoch": 0.3461295656094995, "flos": 72077837564160.0, "grad_norm": 0.7095907827505381, "language_loss": 0.62560987, "learning_rate": 2.9299609895287817e-06, "loss": 0.64583063, "num_input_tokens_seen": 123802845, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.10644531, "step": 5757, "time_per_iteration": 3.0415329933166504 }, { "auxiliary_loss_clip": 0.01011606, "auxiliary_loss_mlp": 0.01005802, "balance_loss_clip": 1.00447905, "balance_loss_mlp": 1.00110459, "epoch": 0.34618968886216744, "flos": 65457118932480.0, "grad_norm": 0.8188537340470671, "language_loss": 0.59229028, "learning_rate": 2.929626478924461e-06, "loss": 0.61246443, "num_input_tokens_seen": 123861805, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.10498047, "step": 5758, "time_per_iteration": 3.0388827323913574 }, { "auxiliary_loss_clip": 0.01075919, "auxiliary_loss_mlp": 0.01036041, "balance_loss_clip": 1.02117562, "balance_loss_mlp": 1.02438259, "epoch": 0.3462498121148354, "flos": 23471548738560.0, "grad_norm": 1.8880444198446622, "language_loss": 0.71870965, "learning_rate": 2.9292919351441626e-06, "loss": 0.73982924, "num_input_tokens_seen": 123881820, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.515625, "step": 5759, "time_per_iteration": 2.408294677734375 }, { "auxiliary_loss_clip": 0.01074462, "auxiliary_loss_mlp": 0.01038014, "balance_loss_clip": 1.02254105, "balance_loss_mlp": 1.02385116, "epoch": 0.3463099353675034, "flos": 24169636840320.0, "grad_norm": 1.8755606585016409, "language_loss": 0.83612382, "learning_rate": 2.928957358199825e-06, "loss": 0.8572486, "num_input_tokens_seen": 123903700, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5078125, "step": 5760, "time_per_iteration": 2.436171770095825 }, { "auxiliary_loss_clip": 0.01011966, "auxiliary_loss_mlp": 0.01001194, "balance_loss_clip": 0.99995399, "balance_loss_mlp": 1.00140882, "epoch": 0.34637005862017134, "flos": 63697915745280.0, "grad_norm": 0.8139112462603719, "language_loss": 0.56570119, "learning_rate": 2.9286227481033903e-06, "loss": 0.58583277, "num_input_tokens_seen": 123960075, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.10546875, "step": 5761, "time_per_iteration": 3.1018810272216797 }, { "auxiliary_loss_clip": 0.01073732, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.01768374, "balance_loss_mlp": 1.02344429, "epoch": 0.3464301818728393, "flos": 13144868709120.0, "grad_norm": 2.0034356663293305, "language_loss": 0.94723499, "learning_rate": 2.9282881048667972e-06, "loss": 0.96829712, "num_input_tokens_seen": 123975805, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.50390625, "step": 5762, "time_per_iteration": 2.357673168182373 }, { "auxiliary_loss_clip": 0.01076457, "auxiliary_loss_mlp": 0.01029967, "balance_loss_clip": 1.01435089, "balance_loss_mlp": 1.02338696, "epoch": 0.34649030512550727, "flos": 29313879408000.0, "grad_norm": 1.7542001792396027, "language_loss": 0.69893539, "learning_rate": 2.927953428501989e-06, "loss": 0.71999967, "num_input_tokens_seen": 123997530, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 5763, "time_per_iteration": 2.442671775817871 }, { "auxiliary_loss_clip": 0.01079704, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.019526, "balance_loss_mlp": 1.02577722, "epoch": 0.34655042837817523, "flos": 23729801132160.0, "grad_norm": 1.6769862060415226, "language_loss": 0.83350599, "learning_rate": 2.9276187190209107e-06, "loss": 0.85467255, "num_input_tokens_seen": 124016375, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5390625, "step": 5764, "time_per_iteration": 2.403332471847534 }, { "auxiliary_loss_clip": 0.01075342, "auxiliary_loss_mlp": 0.01026728, "balance_loss_clip": 1.01154089, "balance_loss_mlp": 1.02309275, "epoch": 0.34661055163084326, "flos": 22053132132480.0, "grad_norm": 2.2522769181028206, "language_loss": 0.67345691, "learning_rate": 2.927283976435506e-06, "loss": 0.69447756, "num_input_tokens_seen": 124033975, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5234375, "step": 5765, "time_per_iteration": 2.3712351322174072 }, { "auxiliary_loss_clip": 0.01076218, "auxiliary_loss_mlp": 0.01035711, "balance_loss_clip": 1.02057719, "balance_loss_mlp": 1.0238992, "epoch": 0.3466706748835112, "flos": 21798126495360.0, "grad_norm": 2.5534336520752845, "language_loss": 0.76755512, "learning_rate": 2.926949200757722e-06, "loss": 0.78867447, "num_input_tokens_seen": 124051930, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5234375, "step": 5766, "time_per_iteration": 2.39005184173584 }, { "auxiliary_loss_clip": 0.01072689, "auxiliary_loss_mlp": 0.01028991, "balance_loss_clip": 1.01452541, "balance_loss_mlp": 1.02277708, "epoch": 0.3467307981361792, "flos": 19460726415360.0, "grad_norm": 1.3885360457512503, "language_loss": 0.73505926, "learning_rate": 2.926614391999505e-06, "loss": 0.7560761, "num_input_tokens_seen": 124071220, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5, "step": 5767, "time_per_iteration": 2.362863302230835 }, { "auxiliary_loss_clip": 0.01078442, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.01603234, "balance_loss_mlp": 1.02600789, "epoch": 0.34679092138884715, "flos": 24826283291520.0, "grad_norm": 1.7762556141611965, "language_loss": 0.77778924, "learning_rate": 2.926279550172804e-06, "loss": 0.79889715, "num_input_tokens_seen": 124090140, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5234375, "step": 5768, "time_per_iteration": 2.4200541973114014 }, { "auxiliary_loss_clip": 0.01013231, "auxiliary_loss_mlp": 0.01001561, "balance_loss_clip": 1.00019014, "balance_loss_mlp": 1.00275207, "epoch": 0.3468510446415151, "flos": 63233116548480.0, "grad_norm": 0.7683543475722181, "language_loss": 0.57456195, "learning_rate": 2.9259446752895686e-06, "loss": 0.59470987, "num_input_tokens_seen": 124152025, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.10498047, "step": 5769, "time_per_iteration": 3.068924903869629 }, { "auxiliary_loss_clip": 0.01083012, "auxiliary_loss_mlp": 0.01032253, "balance_loss_clip": 1.01468158, "balance_loss_mlp": 1.02535903, "epoch": 0.3469111678941831, "flos": 12120168038400.0, "grad_norm": 3.5004369628595042, "language_loss": 0.86012661, "learning_rate": 2.9256097673617495e-06, "loss": 0.88127929, "num_input_tokens_seen": 124165795, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.578125, "step": 5770, "time_per_iteration": 2.387011766433716 }, { "auxiliary_loss_clip": 0.01011965, "auxiliary_loss_mlp": 0.01001873, "balance_loss_clip": 1.00060976, "balance_loss_mlp": 1.0014503, "epoch": 0.34697129114685105, "flos": 65931134728320.0, "grad_norm": 0.7645425760205378, "language_loss": 0.59758162, "learning_rate": 2.9252748264012985e-06, "loss": 0.61772001, "num_input_tokens_seen": 124222925, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.10546875, "step": 5771, "time_per_iteration": 2.822774648666382 }, { "auxiliary_loss_clip": 0.01074503, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.01688278, "balance_loss_mlp": 1.02360356, "epoch": 0.347031414399519, "flos": 34452920183040.0, "grad_norm": 1.6598753304082707, "language_loss": 0.71974301, "learning_rate": 2.9249398524201693e-06, "loss": 0.74080938, "num_input_tokens_seen": 124240915, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 5772, "time_per_iteration": 2.4949748516082764 }, { "auxiliary_loss_clip": 0.01076371, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.01287127, "balance_loss_mlp": 1.02321267, "epoch": 0.347091537652187, "flos": 26942892733440.0, "grad_norm": 1.3769018633624883, "language_loss": 0.76191044, "learning_rate": 2.9246048454303165e-06, "loss": 0.78296363, "num_input_tokens_seen": 124262770, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.53125, "step": 5773, "time_per_iteration": 2.4566125869750977 }, { "auxiliary_loss_clip": 0.01076734, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.01663136, "balance_loss_mlp": 1.02347696, "epoch": 0.34715166090485494, "flos": 21141165841920.0, "grad_norm": 2.3084275675194594, "language_loss": 0.7031635, "learning_rate": 2.9242698054436942e-06, "loss": 0.72426146, "num_input_tokens_seen": 124280950, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5774, "time_per_iteration": 2.4065067768096924 }, { "auxiliary_loss_clip": 0.01074918, "auxiliary_loss_mlp": 0.01032588, "balance_loss_clip": 1.01771677, "balance_loss_mlp": 1.02435637, "epoch": 0.3472117841575229, "flos": 23476855265280.0, "grad_norm": 1.6153572545834267, "language_loss": 0.76111162, "learning_rate": 2.9239347324722605e-06, "loss": 0.78218669, "num_input_tokens_seen": 124299540, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.50390625, "step": 5775, "time_per_iteration": 2.5024466514587402 }, { "auxiliary_loss_clip": 0.01077937, "auxiliary_loss_mlp": 0.01031097, "balance_loss_clip": 1.0144968, "balance_loss_mlp": 1.02390623, "epoch": 0.34727190741019087, "flos": 17491869313920.0, "grad_norm": 2.0187230457228624, "language_loss": 0.77591276, "learning_rate": 2.923599626527973e-06, "loss": 0.79700303, "num_input_tokens_seen": 124316285, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5390625, "step": 5776, "time_per_iteration": 2.374683380126953 }, { "auxiliary_loss_clip": 0.01014615, "auxiliary_loss_mlp": 0.01000357, "balance_loss_clip": 0.99898612, "balance_loss_mlp": 1.00433517, "epoch": 0.34733203066285884, "flos": 65261848498560.0, "grad_norm": 1.13396177879421, "language_loss": 0.63349223, "learning_rate": 2.9232644876227904e-06, "loss": 0.65364194, "num_input_tokens_seen": 124376650, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.10253906, "step": 5777, "time_per_iteration": 3.0906662940979004 }, { "auxiliary_loss_clip": 0.0107635, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.01598871, "balance_loss_mlp": 1.02296972, "epoch": 0.3473921539155268, "flos": 28657442424960.0, "grad_norm": 1.828737695489423, "language_loss": 0.64427119, "learning_rate": 2.9229293157686732e-06, "loss": 0.66535383, "num_input_tokens_seen": 124396475, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.53125, "step": 5778, "time_per_iteration": 2.5096380710601807 }, { "auxiliary_loss_clip": 0.01079001, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.0173161, "balance_loss_mlp": 1.02501488, "epoch": 0.3474522771681948, "flos": 40835497230720.0, "grad_norm": 1.6874462169990343, "language_loss": 0.71371233, "learning_rate": 2.9225941109775825e-06, "loss": 0.73483562, "num_input_tokens_seen": 124416480, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5390625, "step": 5779, "time_per_iteration": 2.5633721351623535 }, { "auxiliary_loss_clip": 0.01080528, "auxiliary_loss_mlp": 0.01037824, "balance_loss_clip": 1.02174294, "balance_loss_mlp": 1.02552652, "epoch": 0.3475124004208628, "flos": 24607412778240.0, "grad_norm": 2.043754218686711, "language_loss": 0.62217283, "learning_rate": 2.9222588732614818e-06, "loss": 0.64335632, "num_input_tokens_seen": 124435950, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.55078125, "step": 5780, "time_per_iteration": 2.4177258014678955 }, { "auxiliary_loss_clip": 0.01075642, "auxiliary_loss_mlp": 0.0103364, "balance_loss_clip": 1.01859534, "balance_loss_mlp": 1.02438533, "epoch": 0.34757252367353075, "flos": 22710197520000.0, "grad_norm": 1.6086040301959126, "language_loss": 0.72151911, "learning_rate": 2.921923602632333e-06, "loss": 0.74261189, "num_input_tokens_seen": 124455410, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.51171875, "step": 5781, "time_per_iteration": 2.430724620819092 }, { "auxiliary_loss_clip": 0.01079651, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.02189898, "balance_loss_mlp": 1.02653515, "epoch": 0.3476326469261987, "flos": 19827174712320.0, "grad_norm": 1.8761444313037403, "language_loss": 0.76921785, "learning_rate": 2.9215882991021036e-06, "loss": 0.7904098, "num_input_tokens_seen": 124474870, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.53125, "step": 5782, "time_per_iteration": 2.3804807662963867 }, { "auxiliary_loss_clip": 0.01075955, "auxiliary_loss_mlp": 0.01031543, "balance_loss_clip": 1.01630783, "balance_loss_mlp": 1.02366221, "epoch": 0.3476927701788667, "flos": 19937081272320.0, "grad_norm": 1.8881325478499325, "language_loss": 0.62519693, "learning_rate": 2.9212529626827582e-06, "loss": 0.64627182, "num_input_tokens_seen": 124494105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5234375, "step": 5783, "time_per_iteration": 2.4045162200927734 }, { "auxiliary_loss_clip": 0.01073196, "auxiliary_loss_mlp": 0.0102567, "balance_loss_clip": 1.01146054, "balance_loss_mlp": 1.02310634, "epoch": 0.34775289343153465, "flos": 20734218501120.0, "grad_norm": 1.6226022829266527, "language_loss": 0.88506716, "learning_rate": 2.9209175933862636e-06, "loss": 0.90605581, "num_input_tokens_seen": 124512030, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.5, "step": 5784, "time_per_iteration": 3.7488017082214355 }, { "auxiliary_loss_clip": 0.01074304, "auxiliary_loss_mlp": 0.01029736, "balance_loss_clip": 1.01439333, "balance_loss_mlp": 1.02360535, "epoch": 0.3478130166842026, "flos": 19353822232320.0, "grad_norm": 1.5938023869846187, "language_loss": 0.81219316, "learning_rate": 2.92058219122459e-06, "loss": 0.83323359, "num_input_tokens_seen": 124530980, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5078125, "step": 5785, "time_per_iteration": 2.414529800415039 }, { "auxiliary_loss_clip": 0.01079448, "auxiliary_loss_mlp": 0.01037375, "balance_loss_clip": 1.02221203, "balance_loss_mlp": 1.02634561, "epoch": 0.3478731399368706, "flos": 22050199578240.0, "grad_norm": 1.794738943084231, "language_loss": 0.80747348, "learning_rate": 2.9202467562097052e-06, "loss": 0.82864165, "num_input_tokens_seen": 124549330, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.53125, "step": 5786, "time_per_iteration": 2.4369876384735107 }, { "auxiliary_loss_clip": 0.01076496, "auxiliary_loss_mlp": 0.01032913, "balance_loss_clip": 1.01723695, "balance_loss_mlp": 1.02581239, "epoch": 0.34793326318953854, "flos": 18040459507200.0, "grad_norm": 2.3785193508610014, "language_loss": 0.75002062, "learning_rate": 2.9199112883535813e-06, "loss": 0.77111471, "num_input_tokens_seen": 124567200, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5078125, "step": 5787, "time_per_iteration": 5.203081369400024 }, { "auxiliary_loss_clip": 0.01076853, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.01446939, "balance_loss_mlp": 1.02461243, "epoch": 0.3479933864422065, "flos": 29313390648960.0, "grad_norm": 1.7350346884740975, "language_loss": 0.81622982, "learning_rate": 2.919575787668189e-06, "loss": 0.83729744, "num_input_tokens_seen": 124587025, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5234375, "step": 5788, "time_per_iteration": 2.4759206771850586 }, { "auxiliary_loss_clip": 0.01081896, "auxiliary_loss_mlp": 0.01033492, "balance_loss_clip": 1.01657593, "balance_loss_mlp": 1.02689099, "epoch": 0.3480535096948745, "flos": 20119677016320.0, "grad_norm": 2.4067150975214235, "language_loss": 0.8551451, "learning_rate": 2.919240254165503e-06, "loss": 0.87629896, "num_input_tokens_seen": 124605860, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.55078125, "step": 5789, "time_per_iteration": 2.4148001670837402 }, { "auxiliary_loss_clip": 0.01079605, "auxiliary_loss_mlp": 0.01049796, "balance_loss_clip": 1.03234363, "balance_loss_mlp": 1.02595782, "epoch": 0.34811363294754244, "flos": 18548061897600.0, "grad_norm": 1.7079288661975327, "language_loss": 0.85120916, "learning_rate": 2.918904687857497e-06, "loss": 0.87250316, "num_input_tokens_seen": 124624270, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.53515625, "step": 5790, "time_per_iteration": 2.3937594890594482 }, { "auxiliary_loss_clip": 0.0107882, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.02116656, "balance_loss_mlp": 1.02550459, "epoch": 0.3481737562002104, "flos": 26869086385920.0, "grad_norm": 2.0424667302257014, "language_loss": 0.81423348, "learning_rate": 2.9185690887561463e-06, "loss": 0.8353976, "num_input_tokens_seen": 124644005, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5791, "time_per_iteration": 2.4153356552124023 }, { "auxiliary_loss_clip": 0.01077823, "auxiliary_loss_mlp": 0.01029016, "balance_loss_clip": 1.01322699, "balance_loss_mlp": 1.02349496, "epoch": 0.3482338794528784, "flos": 28907525560320.0, "grad_norm": 1.8588390468281573, "language_loss": 0.77465641, "learning_rate": 2.918233456873428e-06, "loss": 0.79572481, "num_input_tokens_seen": 124663020, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.54296875, "step": 5792, "time_per_iteration": 2.45379900932312 }, { "auxiliary_loss_clip": 0.01074614, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.01203263, "balance_loss_mlp": 1.02255368, "epoch": 0.3482940027055464, "flos": 22199662575360.0, "grad_norm": 1.6172860339390311, "language_loss": 0.81855458, "learning_rate": 2.9178977922213188e-06, "loss": 0.83957553, "num_input_tokens_seen": 124682975, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.51953125, "step": 5793, "time_per_iteration": 3.8589749336242676 }, { "auxiliary_loss_clip": 0.01077809, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.02494013, "balance_loss_mlp": 1.02469444, "epoch": 0.34835412595821436, "flos": 20301679267200.0, "grad_norm": 1.7174350608394966, "language_loss": 0.75724077, "learning_rate": 2.917562094811799e-06, "loss": 0.77843487, "num_input_tokens_seen": 124701340, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.53125, "step": 5794, "time_per_iteration": 2.424049139022827 }, { "auxiliary_loss_clip": 0.01076234, "auxiliary_loss_mlp": 0.01036285, "balance_loss_clip": 1.02066886, "balance_loss_mlp": 1.02419746, "epoch": 0.3484142492108823, "flos": 20448628646400.0, "grad_norm": 5.746312748421174, "language_loss": 0.56843466, "learning_rate": 2.917226364656848e-06, "loss": 0.58955985, "num_input_tokens_seen": 124719165, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5234375, "step": 5795, "time_per_iteration": 2.3890457153320312 }, { "auxiliary_loss_clip": 0.0107618, "auxiliary_loss_mlp": 0.01028074, "balance_loss_clip": 1.01239753, "balance_loss_mlp": 1.02457547, "epoch": 0.3484743724635503, "flos": 24351778736640.0, "grad_norm": 1.7390058205596206, "language_loss": 0.82748753, "learning_rate": 2.9168906017684474e-06, "loss": 0.84853005, "num_input_tokens_seen": 124738670, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.515625, "step": 5796, "time_per_iteration": 2.425893545150757 }, { "auxiliary_loss_clip": 0.01074377, "auxiliary_loss_mlp": 0.01027431, "balance_loss_clip": 1.01304209, "balance_loss_mlp": 1.02409446, "epoch": 0.34853449571621825, "flos": 24351848559360.0, "grad_norm": 1.7386246320570766, "language_loss": 0.83200371, "learning_rate": 2.91655480615858e-06, "loss": 0.8530218, "num_input_tokens_seen": 124758760, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.50390625, "step": 5797, "time_per_iteration": 2.4673445224761963 }, { "auxiliary_loss_clip": 0.01073897, "auxiliary_loss_mlp": 0.0103212, "balance_loss_clip": 1.01665902, "balance_loss_mlp": 1.02394271, "epoch": 0.3485946189688862, "flos": 27266572748160.0, "grad_norm": 2.4555665395737285, "language_loss": 0.73516202, "learning_rate": 2.9162189778392286e-06, "loss": 0.75622225, "num_input_tokens_seen": 124777765, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5, "step": 5798, "time_per_iteration": 2.4390857219696045 }, { "auxiliary_loss_clip": 0.0107446, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.01660144, "balance_loss_mlp": 1.02269697, "epoch": 0.3486547422215542, "flos": 20155672494720.0, "grad_norm": 2.020521921115232, "language_loss": 0.75903696, "learning_rate": 2.9158831168223797e-06, "loss": 0.78010577, "num_input_tokens_seen": 124796775, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.51953125, "step": 5799, "time_per_iteration": 2.391947031021118 }, { "auxiliary_loss_clip": 0.01076702, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.01552272, "balance_loss_mlp": 1.0248158, "epoch": 0.34871486547422215, "flos": 20229304285440.0, "grad_norm": 2.0517329753676483, "language_loss": 0.75460827, "learning_rate": 2.915547223120018e-06, "loss": 0.77567446, "num_input_tokens_seen": 124815825, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.51953125, "step": 5800, "time_per_iteration": 2.4259848594665527 }, { "auxiliary_loss_clip": 0.01079949, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.0218401, "balance_loss_mlp": 1.02539814, "epoch": 0.3487749887268901, "flos": 44051591208960.0, "grad_norm": 1.6037827883161688, "language_loss": 0.66977096, "learning_rate": 2.9152112967441307e-06, "loss": 0.69095802, "num_input_tokens_seen": 124838420, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.546875, "step": 5801, "time_per_iteration": 2.594118118286133 }, { "auxiliary_loss_clip": 0.01074173, "auxiliary_loss_mlp": 0.01029465, "balance_loss_clip": 1.01387203, "balance_loss_mlp": 1.02352047, "epoch": 0.3488351119795581, "flos": 23294015141760.0, "grad_norm": 1.8131761048769601, "language_loss": 0.76868844, "learning_rate": 2.9148753377067063e-06, "loss": 0.78972483, "num_input_tokens_seen": 124857320, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5078125, "step": 5802, "time_per_iteration": 2.4443297386169434 }, { "auxiliary_loss_clip": 0.01072151, "auxiliary_loss_mlp": 0.01030415, "balance_loss_clip": 1.01556194, "balance_loss_mlp": 1.02278686, "epoch": 0.34889523523222604, "flos": 19933904338560.0, "grad_norm": 1.5974508130425775, "language_loss": 0.78238654, "learning_rate": 2.9145393460197346e-06, "loss": 0.8034122, "num_input_tokens_seen": 124875685, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49414062, "step": 5803, "time_per_iteration": 2.3786134719848633 }, { "auxiliary_loss_clip": 0.01075542, "auxiliary_loss_mlp": 0.01032346, "balance_loss_clip": 1.01674187, "balance_loss_mlp": 1.02237821, "epoch": 0.348955358484894, "flos": 30444855857280.0, "grad_norm": 2.55264393675343, "language_loss": 0.67801392, "learning_rate": 2.914203321695206e-06, "loss": 0.69909281, "num_input_tokens_seen": 124895960, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.53125, "step": 5804, "time_per_iteration": 2.4601457118988037 }, { "auxiliary_loss_clip": 0.01072578, "auxiliary_loss_mlp": 0.0103481, "balance_loss_clip": 1.01994491, "balance_loss_mlp": 1.02311182, "epoch": 0.349015481737562, "flos": 17999122590720.0, "grad_norm": 1.7104962421911951, "language_loss": 0.76287705, "learning_rate": 2.913867264745113e-06, "loss": 0.78395092, "num_input_tokens_seen": 124914140, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49609375, "step": 5805, "time_per_iteration": 2.388007640838623 }, { "auxiliary_loss_clip": 0.01076168, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.01469731, "balance_loss_mlp": 1.02544165, "epoch": 0.34907560499023, "flos": 27197269966080.0, "grad_norm": 4.360438706111584, "language_loss": 0.67598635, "learning_rate": 2.913531175181448e-06, "loss": 0.6970433, "num_input_tokens_seen": 124934180, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 5806, "time_per_iteration": 2.4748339653015137 }, { "auxiliary_loss_clip": 0.01077264, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.01806498, "balance_loss_mlp": 1.0251286, "epoch": 0.34913572824289796, "flos": 30225566407680.0, "grad_norm": 1.412946999041995, "language_loss": 0.71812558, "learning_rate": 2.913195053016205e-06, "loss": 0.73923028, "num_input_tokens_seen": 124956060, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5234375, "step": 5807, "time_per_iteration": 2.527106285095215 }, { "auxiliary_loss_clip": 0.01073411, "auxiliary_loss_mlp": 0.01035095, "balance_loss_clip": 1.01941895, "balance_loss_mlp": 1.02175343, "epoch": 0.3491958514955659, "flos": 29970595681920.0, "grad_norm": 1.828088224074478, "language_loss": 0.73812759, "learning_rate": 2.9128588982613794e-06, "loss": 0.75921267, "num_input_tokens_seen": 124976070, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.515625, "step": 5808, "time_per_iteration": 2.4526548385620117 }, { "auxiliary_loss_clip": 0.01073649, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.01828885, "balance_loss_mlp": 1.02470326, "epoch": 0.3492559747482339, "flos": 22782188476800.0, "grad_norm": 1.5046794121200924, "language_loss": 0.84514034, "learning_rate": 2.912522710928968e-06, "loss": 0.86620349, "num_input_tokens_seen": 124996995, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49023438, "step": 5809, "time_per_iteration": 2.4007365703582764 }, { "auxiliary_loss_clip": 0.0107256, "auxiliary_loss_mlp": 0.01031038, "balance_loss_clip": 1.01753116, "balance_loss_mlp": 1.0239048, "epoch": 0.34931609800090185, "flos": 26066817187200.0, "grad_norm": 1.813103706262116, "language_loss": 0.80167866, "learning_rate": 2.912186491030968e-06, "loss": 0.82271469, "num_input_tokens_seen": 125015600, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.48828125, "step": 5810, "time_per_iteration": 2.4301726818084717 }, { "auxiliary_loss_clip": 0.01072106, "auxiliary_loss_mlp": 0.01032702, "balance_loss_clip": 1.01809931, "balance_loss_mlp": 1.02249813, "epoch": 0.3493762212535698, "flos": 29240736376320.0, "grad_norm": 1.6366058780106694, "language_loss": 0.75798559, "learning_rate": 2.911850238579379e-06, "loss": 0.77903366, "num_input_tokens_seen": 125035290, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49609375, "step": 5811, "time_per_iteration": 2.4376323223114014 }, { "auxiliary_loss_clip": 0.0107568, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.0161581, "balance_loss_mlp": 1.02264857, "epoch": 0.3494363445062378, "flos": 27124825161600.0, "grad_norm": 1.3865002619728117, "language_loss": 0.79880184, "learning_rate": 2.9115139535862003e-06, "loss": 0.81987703, "num_input_tokens_seen": 125057130, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.53125, "step": 5812, "time_per_iteration": 2.4402496814727783 }, { "auxiliary_loss_clip": 0.01073742, "auxiliary_loss_mlp": 0.01031055, "balance_loss_clip": 1.01657104, "balance_loss_mlp": 1.02258897, "epoch": 0.34949646775890575, "flos": 12275391409920.0, "grad_norm": 1.907659193732723, "language_loss": 0.69322318, "learning_rate": 2.9111776360634334e-06, "loss": 0.71427113, "num_input_tokens_seen": 125073720, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.51171875, "step": 5813, "time_per_iteration": 2.3609864711761475 }, { "auxiliary_loss_clip": 0.01070174, "auxiliary_loss_mlp": 0.01029968, "balance_loss_clip": 1.01554418, "balance_loss_mlp": 1.02205002, "epoch": 0.3495565910115737, "flos": 17164558517760.0, "grad_norm": 1.864892791019543, "language_loss": 0.76173961, "learning_rate": 2.9108412860230806e-06, "loss": 0.78274101, "num_input_tokens_seen": 125090635, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48046875, "step": 5814, "time_per_iteration": 2.375277519226074 }, { "auxiliary_loss_clip": 0.01075564, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 1.02041626, "balance_loss_mlp": 1.0230937, "epoch": 0.3496167142642417, "flos": 26464547928960.0, "grad_norm": 1.641588965021327, "language_loss": 0.84404933, "learning_rate": 2.910504903477145e-06, "loss": 0.86517131, "num_input_tokens_seen": 125110070, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5234375, "step": 5815, "time_per_iteration": 2.434227466583252 }, { "auxiliary_loss_clip": 0.01070806, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.01516712, "balance_loss_mlp": 1.02113342, "epoch": 0.34967683751690964, "flos": 17414048160000.0, "grad_norm": 1.9441700650437517, "language_loss": 0.77666688, "learning_rate": 2.910168488437632e-06, "loss": 0.79766285, "num_input_tokens_seen": 125125730, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.49609375, "step": 5816, "time_per_iteration": 2.381499767303467 }, { "auxiliary_loss_clip": 0.01074997, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.01498866, "balance_loss_mlp": 1.02349591, "epoch": 0.3497369607695776, "flos": 22598964328320.0, "grad_norm": 1.8082848661862878, "language_loss": 0.58813262, "learning_rate": 2.9098320409165462e-06, "loss": 0.60917848, "num_input_tokens_seen": 125146195, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.515625, "step": 5817, "time_per_iteration": 2.3892834186553955 }, { "auxiliary_loss_clip": 0.0101592, "auxiliary_loss_mlp": 0.01009118, "balance_loss_clip": 1.00780642, "balance_loss_mlp": 1.00540102, "epoch": 0.34979708402224563, "flos": 68526193708800.0, "grad_norm": 0.87321725435861, "language_loss": 0.59851706, "learning_rate": 2.9094955609258954e-06, "loss": 0.6187675, "num_input_tokens_seen": 125207790, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.10546875, "step": 5818, "time_per_iteration": 3.0726945400238037 }, { "auxiliary_loss_clip": 0.01070647, "auxiliary_loss_mlp": 0.01026739, "balance_loss_clip": 1.01235628, "balance_loss_mlp": 1.02256739, "epoch": 0.3498572072749136, "flos": 18988630744320.0, "grad_norm": 1.996602069957389, "language_loss": 0.83271128, "learning_rate": 2.909159048477688e-06, "loss": 0.85368514, "num_input_tokens_seen": 125226220, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.48046875, "step": 5819, "time_per_iteration": 2.385380744934082 }, { "auxiliary_loss_clip": 0.01071479, "auxiliary_loss_mlp": 0.0102749, "balance_loss_clip": 1.01358426, "balance_loss_mlp": 1.0221417, "epoch": 0.34991733052758156, "flos": 27817641648000.0, "grad_norm": 2.0543557592234083, "language_loss": 0.71232194, "learning_rate": 2.9088225035839327e-06, "loss": 0.73331165, "num_input_tokens_seen": 125247485, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.4921875, "step": 5820, "time_per_iteration": 2.441352128982544 }, { "auxiliary_loss_clip": 0.01073675, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.01569295, "balance_loss_mlp": 1.02273846, "epoch": 0.3499774537802495, "flos": 33582779568000.0, "grad_norm": 1.650895162186316, "language_loss": 0.70354503, "learning_rate": 2.9084859262566397e-06, "loss": 0.72457588, "num_input_tokens_seen": 125268625, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.5078125, "step": 5821, "time_per_iteration": 2.4730236530303955 }, { "auxiliary_loss_clip": 0.01079223, "auxiliary_loss_mlp": 0.0103638, "balance_loss_clip": 1.01930976, "balance_loss_mlp": 1.02490127, "epoch": 0.3500375770329175, "flos": 23475633367680.0, "grad_norm": 1.9285327056340842, "language_loss": 0.73762476, "learning_rate": 2.9081493165078216e-06, "loss": 0.75878084, "num_input_tokens_seen": 125287530, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.54296875, "step": 5822, "time_per_iteration": 2.4335241317749023 }, { "auxiliary_loss_clip": 0.01075611, "auxiliary_loss_mlp": 0.0102836, "balance_loss_clip": 1.01213551, "balance_loss_mlp": 1.02340245, "epoch": 0.35009770028558546, "flos": 19025045159040.0, "grad_norm": 2.431197042198075, "language_loss": 0.78223145, "learning_rate": 2.907812674349489e-06, "loss": 0.80327117, "num_input_tokens_seen": 125307020, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.51953125, "step": 5823, "time_per_iteration": 3.828317165374756 }, { "auxiliary_loss_clip": 0.01011293, "auxiliary_loss_mlp": 0.01006719, "balance_loss_clip": 1.00555038, "balance_loss_mlp": 1.00143886, "epoch": 0.3501578235382534, "flos": 68348555377920.0, "grad_norm": 0.7144161359937008, "language_loss": 0.5924117, "learning_rate": 2.907475999793659e-06, "loss": 0.6125918, "num_input_tokens_seen": 125370445, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.09863281, "step": 5824, "time_per_iteration": 3.057630777359009 }, { "auxiliary_loss_clip": 0.01074372, "auxiliary_loss_mlp": 0.01028096, "balance_loss_clip": 1.01267076, "balance_loss_mlp": 1.02360642, "epoch": 0.3502179467909214, "flos": 21249850504320.0, "grad_norm": 2.0157943792023287, "language_loss": 0.84892666, "learning_rate": 2.9071392928523433e-06, "loss": 0.86995137, "num_input_tokens_seen": 125388900, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5078125, "step": 5825, "time_per_iteration": 2.445631742477417 }, { "auxiliary_loss_clip": 0.01075198, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.01110983, "balance_loss_mlp": 1.02463436, "epoch": 0.35027807004358935, "flos": 11942285328000.0, "grad_norm": 2.8645329266613824, "language_loss": 0.83159238, "learning_rate": 2.9068025535375603e-06, "loss": 0.8525992, "num_input_tokens_seen": 125402675, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.5078125, "step": 5826, "time_per_iteration": 2.364750385284424 }, { "auxiliary_loss_clip": 0.01076329, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.02020931, "balance_loss_mlp": 1.02465343, "epoch": 0.3503381932962573, "flos": 21469838181120.0, "grad_norm": 1.482831130007531, "language_loss": 0.808658, "learning_rate": 2.9064657818613274e-06, "loss": 0.82977796, "num_input_tokens_seen": 125421360, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.515625, "step": 5827, "time_per_iteration": 5.1953113079071045 }, { "auxiliary_loss_clip": 0.0107525, "auxiliary_loss_mlp": 0.01028869, "balance_loss_clip": 1.01473665, "balance_loss_mlp": 1.02498889, "epoch": 0.3503983165489253, "flos": 21250059972480.0, "grad_norm": 3.244150573028192, "language_loss": 0.70889628, "learning_rate": 2.906128977835661e-06, "loss": 0.72993743, "num_input_tokens_seen": 125440000, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.50390625, "step": 5828, "time_per_iteration": 2.397907018661499 }, { "auxiliary_loss_clip": 0.01079964, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.01635885, "balance_loss_mlp": 1.02630138, "epoch": 0.35045843980159325, "flos": 27814569448320.0, "grad_norm": 1.781232522570234, "language_loss": 0.79580939, "learning_rate": 2.9057921414725838e-06, "loss": 0.81694257, "num_input_tokens_seen": 125460390, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.5390625, "step": 5829, "time_per_iteration": 2.4365835189819336 }, { "auxiliary_loss_clip": 0.01077453, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.02403903, "balance_loss_mlp": 1.02434659, "epoch": 0.3505185630542612, "flos": 25919972542080.0, "grad_norm": 2.0281747068118174, "language_loss": 0.72217435, "learning_rate": 2.9054552727841136e-06, "loss": 0.74335325, "num_input_tokens_seen": 125478410, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5830, "time_per_iteration": 2.428008794784546 }, { "auxiliary_loss_clip": 0.01074093, "auxiliary_loss_mlp": 0.01026985, "balance_loss_clip": 1.0122267, "balance_loss_mlp": 1.02410579, "epoch": 0.35057868630692923, "flos": 20520724337280.0, "grad_norm": 2.4031299902788508, "language_loss": 0.88610458, "learning_rate": 2.905118371782275e-06, "loss": 0.90711534, "num_input_tokens_seen": 125495975, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5, "step": 5831, "time_per_iteration": 2.3890857696533203 }, { "auxiliary_loss_clip": 0.01073385, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.01850271, "balance_loss_mlp": 1.02219677, "epoch": 0.3506388095595972, "flos": 20447616216960.0, "grad_norm": 1.8117341828354205, "language_loss": 0.78457981, "learning_rate": 2.9047814384790894e-06, "loss": 0.8056531, "num_input_tokens_seen": 125515035, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.51171875, "step": 5832, "time_per_iteration": 3.911949872970581 }, { "auxiliary_loss_clip": 0.01074548, "auxiliary_loss_mlp": 0.01038139, "balance_loss_clip": 1.02183068, "balance_loss_mlp": 1.02344561, "epoch": 0.35069893281226516, "flos": 23108626488960.0, "grad_norm": 3.267834556160976, "language_loss": 0.70697343, "learning_rate": 2.9044444728865814e-06, "loss": 0.7281003, "num_input_tokens_seen": 125535555, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.51171875, "step": 5833, "time_per_iteration": 2.3969180583953857 }, { "auxiliary_loss_clip": 0.01073233, "auxiliary_loss_mlp": 0.01028039, "balance_loss_clip": 1.01367474, "balance_loss_mlp": 1.02462578, "epoch": 0.35075905606493313, "flos": 27270762111360.0, "grad_norm": 1.395886004571591, "language_loss": 0.80790132, "learning_rate": 2.904107475016777e-06, "loss": 0.82891405, "num_input_tokens_seen": 125558195, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.48632812, "step": 5834, "time_per_iteration": 2.6021265983581543 }, { "auxiliary_loss_clip": 0.01074976, "auxiliary_loss_mlp": 0.01032193, "balance_loss_clip": 1.01661253, "balance_loss_mlp": 1.02431047, "epoch": 0.3508191793176011, "flos": 19127794890240.0, "grad_norm": 1.9985227525423257, "language_loss": 0.8406868, "learning_rate": 2.903770444881702e-06, "loss": 0.86175847, "num_input_tokens_seen": 125575375, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5078125, "step": 5835, "time_per_iteration": 2.368988037109375 }, { "auxiliary_loss_clip": 0.01072856, "auxiliary_loss_mlp": 0.0103457, "balance_loss_clip": 1.01981819, "balance_loss_mlp": 1.02323973, "epoch": 0.35087930257026906, "flos": 25556386976640.0, "grad_norm": 1.4509454469989345, "language_loss": 0.76626897, "learning_rate": 2.903433382493386e-06, "loss": 0.78734314, "num_input_tokens_seen": 125596745, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.49609375, "step": 5836, "time_per_iteration": 2.5065622329711914 }, { "auxiliary_loss_clip": 0.0107682, "auxiliary_loss_mlp": 0.01030679, "balance_loss_clip": 1.01526499, "balance_loss_mlp": 1.02583432, "epoch": 0.350939425822937, "flos": 18003277042560.0, "grad_norm": 1.9367643988797245, "language_loss": 0.77378464, "learning_rate": 2.903096287863855e-06, "loss": 0.79485965, "num_input_tokens_seen": 125613980, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.51171875, "step": 5837, "time_per_iteration": 2.370356798171997 }, { "auxiliary_loss_clip": 0.01073095, "auxiliary_loss_mlp": 0.01028777, "balance_loss_clip": 1.01420355, "balance_loss_mlp": 1.02333236, "epoch": 0.350999549075605, "flos": 22272107379840.0, "grad_norm": 1.7659622437642473, "language_loss": 0.67823792, "learning_rate": 2.902759161005141e-06, "loss": 0.69925666, "num_input_tokens_seen": 125632100, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.49804688, "step": 5838, "time_per_iteration": 2.4062552452087402 }, { "auxiliary_loss_clip": 0.01075698, "auxiliary_loss_mlp": 0.0102871, "balance_loss_clip": 1.01408279, "balance_loss_mlp": 1.02416635, "epoch": 0.35105967232827295, "flos": 14391407358720.0, "grad_norm": 2.078667375600241, "language_loss": 0.83085197, "learning_rate": 2.9024220019292752e-06, "loss": 0.85189605, "num_input_tokens_seen": 125649190, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.515625, "step": 5839, "time_per_iteration": 2.354326009750366 }, { "auxiliary_loss_clip": 0.01076081, "auxiliary_loss_mlp": 0.01034223, "balance_loss_clip": 1.01841605, "balance_loss_mlp": 1.02299321, "epoch": 0.3511197955809409, "flos": 25081184194560.0, "grad_norm": 1.6183001641249404, "language_loss": 0.59279394, "learning_rate": 2.902084810648289e-06, "loss": 0.61389709, "num_input_tokens_seen": 125668680, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53125, "step": 5840, "time_per_iteration": 2.420515537261963 }, { "auxiliary_loss_clip": 0.01073923, "auxiliary_loss_mlp": 0.01031403, "balance_loss_clip": 1.01645458, "balance_loss_mlp": 1.02264929, "epoch": 0.3511799188336089, "flos": 25882999545600.0, "grad_norm": 2.231764468532241, "language_loss": 0.87437105, "learning_rate": 2.901747587174216e-06, "loss": 0.89542437, "num_input_tokens_seen": 125686935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.51171875, "step": 5841, "time_per_iteration": 2.412553310394287 }, { "auxiliary_loss_clip": 0.0107715, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.01469123, "balance_loss_mlp": 1.02402568, "epoch": 0.35124004208627685, "flos": 20082704019840.0, "grad_norm": 1.8000295990916717, "language_loss": 0.75027156, "learning_rate": 2.9014103315190916e-06, "loss": 0.77135301, "num_input_tokens_seen": 125707180, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.53125, "step": 5842, "time_per_iteration": 2.4183547496795654 }, { "auxiliary_loss_clip": 0.01075919, "auxiliary_loss_mlp": 0.01035649, "balance_loss_clip": 1.02070045, "balance_loss_mlp": 1.02348018, "epoch": 0.3513001653389448, "flos": 17782521315840.0, "grad_norm": 2.6569198410275496, "language_loss": 0.68676543, "learning_rate": 2.9010730436949514e-06, "loss": 0.70788109, "num_input_tokens_seen": 125722780, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5234375, "step": 5843, "time_per_iteration": 2.3426473140716553 }, { "auxiliary_loss_clip": 0.01075804, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.01873565, "balance_loss_mlp": 1.02483296, "epoch": 0.3513602885916128, "flos": 29385870364800.0, "grad_norm": 1.9303890909133155, "language_loss": 0.65193594, "learning_rate": 2.900735723713832e-06, "loss": 0.67303848, "num_input_tokens_seen": 125742110, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.51171875, "step": 5844, "time_per_iteration": 2.4219937324523926 }, { "auxiliary_loss_clip": 0.01075904, "auxiliary_loss_mlp": 0.01034569, "balance_loss_clip": 1.01880348, "balance_loss_mlp": 1.02411294, "epoch": 0.3514204118442808, "flos": 16178960436480.0, "grad_norm": 1.8652018751226302, "language_loss": 0.75374216, "learning_rate": 2.9003983715877713e-06, "loss": 0.77484691, "num_input_tokens_seen": 125759980, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.515625, "step": 5845, "time_per_iteration": 2.391521692276001 }, { "auxiliary_loss_clip": 0.01074055, "auxiliary_loss_mlp": 0.01035567, "balance_loss_clip": 1.02029073, "balance_loss_mlp": 1.02443659, "epoch": 0.35148053509694877, "flos": 23833737849600.0, "grad_norm": 2.568877763249513, "language_loss": 0.73095214, "learning_rate": 2.9000609873288085e-06, "loss": 0.75204837, "num_input_tokens_seen": 125772660, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49609375, "step": 5846, "time_per_iteration": 2.373753547668457 }, { "auxiliary_loss_clip": 0.01076269, "auxiliary_loss_mlp": 0.01032295, "balance_loss_clip": 1.0172565, "balance_loss_mlp": 1.02514589, "epoch": 0.35154065834961673, "flos": 20990376213120.0, "grad_norm": 1.6068390241702384, "language_loss": 0.75692546, "learning_rate": 2.8997235709489845e-06, "loss": 0.77801108, "num_input_tokens_seen": 125791935, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 5847, "time_per_iteration": 2.4587488174438477 }, { "auxiliary_loss_clip": 0.0107543, "auxiliary_loss_mlp": 0.01030056, "balance_loss_clip": 1.01584053, "balance_loss_mlp": 1.02389681, "epoch": 0.3516007816022847, "flos": 33254072317440.0, "grad_norm": 2.1191199957805007, "language_loss": 0.7246365, "learning_rate": 2.8993861224603412e-06, "loss": 0.74569136, "num_input_tokens_seen": 125813455, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.515625, "step": 5848, "time_per_iteration": 2.530982494354248 }, { "auxiliary_loss_clip": 0.01079916, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.01999366, "balance_loss_mlp": 1.02587247, "epoch": 0.35166090485495266, "flos": 11726207723520.0, "grad_norm": 28.206760223459955, "language_loss": 0.90069497, "learning_rate": 2.8990486418749205e-06, "loss": 0.92186832, "num_input_tokens_seen": 125827660, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.5390625, "step": 5849, "time_per_iteration": 2.4092886447906494 }, { "auxiliary_loss_clip": 0.01074253, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.01482427, "balance_loss_mlp": 1.02365971, "epoch": 0.3517210281076206, "flos": 22637333779200.0, "grad_norm": 2.035788367149342, "language_loss": 0.75290322, "learning_rate": 2.8987111292047663e-06, "loss": 0.77394116, "num_input_tokens_seen": 125846655, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5078125, "step": 5850, "time_per_iteration": 2.414071798324585 }, { "auxiliary_loss_clip": 0.0107581, "auxiliary_loss_mlp": 0.01029625, "balance_loss_clip": 1.01503372, "balance_loss_mlp": 1.02625632, "epoch": 0.3517811513602886, "flos": 21321736727040.0, "grad_norm": 1.4213073000169885, "language_loss": 0.75725776, "learning_rate": 2.898373584461924e-06, "loss": 0.77831215, "num_input_tokens_seen": 125866290, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49609375, "step": 5851, "time_per_iteration": 2.4352986812591553 }, { "auxiliary_loss_clip": 0.01078671, "auxiliary_loss_mlp": 0.01032765, "balance_loss_clip": 1.01679087, "balance_loss_mlp": 1.02562332, "epoch": 0.35184127461295656, "flos": 21031817863680.0, "grad_norm": 1.9879022840375544, "language_loss": 0.87389195, "learning_rate": 2.8980360076584384e-06, "loss": 0.8950063, "num_input_tokens_seen": 125884620, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53125, "step": 5852, "time_per_iteration": 2.383826732635498 }, { "auxiliary_loss_clip": 0.01073404, "auxiliary_loss_mlp": 0.01027158, "balance_loss_clip": 1.0128293, "balance_loss_mlp": 1.02399099, "epoch": 0.3519013978656245, "flos": 22454179453440.0, "grad_norm": 2.0853284929415112, "language_loss": 0.67925978, "learning_rate": 2.8976983988063586e-06, "loss": 0.70026541, "num_input_tokens_seen": 125902430, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49414062, "step": 5853, "time_per_iteration": 2.4830963611602783 }, { "auxiliary_loss_clip": 0.01074363, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.01886415, "balance_loss_mlp": 1.02310848, "epoch": 0.3519615211182925, "flos": 13114459048320.0, "grad_norm": 1.5559215025064466, "language_loss": 0.80806428, "learning_rate": 2.8973607579177317e-06, "loss": 0.82914853, "num_input_tokens_seen": 125920570, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.51171875, "step": 5854, "time_per_iteration": 2.380789041519165 }, { "auxiliary_loss_clip": 0.0107246, "auxiliary_loss_mlp": 0.01028949, "balance_loss_clip": 1.01477504, "balance_loss_mlp": 1.02247334, "epoch": 0.35202164437096045, "flos": 19134148757760.0, "grad_norm": 1.440726560838551, "language_loss": 0.73182976, "learning_rate": 2.8970230850046076e-06, "loss": 0.75284386, "num_input_tokens_seen": 125939800, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.5, "step": 5855, "time_per_iteration": 2.39998197555542 }, { "auxiliary_loss_clip": 0.01071977, "auxiliary_loss_mlp": 0.01030791, "balance_loss_clip": 1.01595569, "balance_loss_mlp": 1.02250242, "epoch": 0.3520817676236284, "flos": 26540972628480.0, "grad_norm": 2.2127951257077836, "language_loss": 0.71072859, "learning_rate": 2.896685380079037e-06, "loss": 0.73175633, "num_input_tokens_seen": 125958720, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49414062, "step": 5856, "time_per_iteration": 2.4252126216888428 }, { "auxiliary_loss_clip": 0.01076514, "auxiliary_loss_mlp": 0.01035549, "balance_loss_clip": 1.01808488, "balance_loss_mlp": 1.02447963, "epoch": 0.3521418908762964, "flos": 44891776010880.0, "grad_norm": 1.711461703634843, "language_loss": 0.61526108, "learning_rate": 2.896347643153072e-06, "loss": 0.63638175, "num_input_tokens_seen": 125984310, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.51953125, "step": 5857, "time_per_iteration": 2.608412265777588 }, { "auxiliary_loss_clip": 0.01074462, "auxiliary_loss_mlp": 0.01030573, "balance_loss_clip": 1.01489115, "balance_loss_mlp": 1.02292824, "epoch": 0.3522020141289644, "flos": 20186536003200.0, "grad_norm": 1.9383542983882343, "language_loss": 0.73661101, "learning_rate": 2.896009874238765e-06, "loss": 0.75766134, "num_input_tokens_seen": 126002410, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.515625, "step": 5858, "time_per_iteration": 2.383868932723999 }, { "auxiliary_loss_clip": 0.0107661, "auxiliary_loss_mlp": 0.0103506, "balance_loss_clip": 1.01953936, "balance_loss_mlp": 1.02272761, "epoch": 0.35226213738163237, "flos": 27562670922240.0, "grad_norm": 1.5275005693541637, "language_loss": 0.76397693, "learning_rate": 2.8956720733481707e-06, "loss": 0.78509367, "num_input_tokens_seen": 126022490, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5390625, "step": 5859, "time_per_iteration": 2.433192253112793 }, { "auxiliary_loss_clip": 0.01081974, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.02261555, "balance_loss_mlp": 1.02637196, "epoch": 0.35232226063430033, "flos": 22965203157120.0, "grad_norm": 1.6797592247291404, "language_loss": 0.72011071, "learning_rate": 2.895334240493344e-06, "loss": 0.74132454, "num_input_tokens_seen": 126042895, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5546875, "step": 5860, "time_per_iteration": 2.3874709606170654 }, { "auxiliary_loss_clip": 0.01077176, "auxiliary_loss_mlp": 0.01033995, "balance_loss_clip": 1.0176518, "balance_loss_mlp": 1.02230132, "epoch": 0.3523823838869683, "flos": 19167386238720.0, "grad_norm": 2.348800594671615, "language_loss": 0.66274589, "learning_rate": 2.8949963756863414e-06, "loss": 0.68385756, "num_input_tokens_seen": 126060130, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 5861, "time_per_iteration": 2.3611271381378174 }, { "auxiliary_loss_clip": 0.01073783, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.01421082, "balance_loss_mlp": 1.02372766, "epoch": 0.35244250713963626, "flos": 17930029276800.0, "grad_norm": 1.7706961305058337, "language_loss": 0.67007422, "learning_rate": 2.8946584789392197e-06, "loss": 0.69109738, "num_input_tokens_seen": 126077850, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.5, "step": 5862, "time_per_iteration": 2.363870620727539 }, { "auxiliary_loss_clip": 0.01077104, "auxiliary_loss_mlp": 0.01032803, "balance_loss_clip": 1.0167222, "balance_loss_mlp": 1.02441454, "epoch": 0.35250263039230423, "flos": 21431503641600.0, "grad_norm": 2.3277378732391774, "language_loss": 0.77282941, "learning_rate": 2.894320550264039e-06, "loss": 0.7939285, "num_input_tokens_seen": 126095985, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.52734375, "step": 5863, "time_per_iteration": 3.743237018585205 }, { "auxiliary_loss_clip": 0.01076797, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.0196079, "balance_loss_mlp": 1.02465439, "epoch": 0.3525627536449722, "flos": 27415651720320.0, "grad_norm": 1.6787246104310511, "language_loss": 0.74978757, "learning_rate": 2.893982589672858e-06, "loss": 0.77090716, "num_input_tokens_seen": 126116070, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5234375, "step": 5864, "time_per_iteration": 2.422184705734253 }, { "auxiliary_loss_clip": 0.01074159, "auxiliary_loss_mlp": 0.01037481, "balance_loss_clip": 1.02261543, "balance_loss_mlp": 1.02355051, "epoch": 0.35262287689764016, "flos": 24788681890560.0, "grad_norm": 2.15381827712521, "language_loss": 0.79011428, "learning_rate": 2.893644597177738e-06, "loss": 0.81123072, "num_input_tokens_seen": 126135205, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.50390625, "step": 5865, "time_per_iteration": 2.4226791858673096 }, { "auxiliary_loss_clip": 0.01078665, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.01717925, "balance_loss_mlp": 1.02550244, "epoch": 0.3526830001503081, "flos": 17820646387200.0, "grad_norm": 1.883785408154615, "language_loss": 0.80973965, "learning_rate": 2.8933065727907417e-06, "loss": 0.83086228, "num_input_tokens_seen": 126151895, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53125, "step": 5866, "time_per_iteration": 3.7610630989074707 }, { "auxiliary_loss_clip": 0.01079901, "auxiliary_loss_mlp": 0.01033245, "balance_loss_clip": 1.0147202, "balance_loss_mlp": 1.02367556, "epoch": 0.3527431234029761, "flos": 18077118301440.0, "grad_norm": 2.086356827126721, "language_loss": 0.83860362, "learning_rate": 2.8929685165239308e-06, "loss": 0.85973513, "num_input_tokens_seen": 126168515, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.5625, "step": 5867, "time_per_iteration": 3.7440896034240723 }, { "auxiliary_loss_clip": 0.01077859, "auxiliary_loss_mlp": 0.01033803, "balance_loss_clip": 1.01700675, "balance_loss_mlp": 1.02475953, "epoch": 0.35280324665564405, "flos": 19426336859520.0, "grad_norm": 1.661167565472378, "language_loss": 0.7399323, "learning_rate": 2.892630428389371e-06, "loss": 0.76104897, "num_input_tokens_seen": 126186460, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.53125, "step": 5868, "time_per_iteration": 2.3896663188934326 }, { "auxiliary_loss_clip": 0.01077802, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 1.01446378, "balance_loss_mlp": 1.02448463, "epoch": 0.352863369908312, "flos": 21503040750720.0, "grad_norm": 2.6046129995476885, "language_loss": 0.61233103, "learning_rate": 2.892292308399127e-06, "loss": 0.63341665, "num_input_tokens_seen": 126206170, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.53515625, "step": 5869, "time_per_iteration": 2.4027748107910156 }, { "auxiliary_loss_clip": 0.01076204, "auxiliary_loss_mlp": 0.01034791, "balance_loss_clip": 1.01955581, "balance_loss_mlp": 1.02358913, "epoch": 0.35292349316098, "flos": 22308417060480.0, "grad_norm": 2.048940801439133, "language_loss": 0.74459761, "learning_rate": 2.8919541565652655e-06, "loss": 0.76570749, "num_input_tokens_seen": 126225605, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.52734375, "step": 5870, "time_per_iteration": 2.402498960494995 }, { "auxiliary_loss_clip": 0.01074536, "auxiliary_loss_mlp": 0.01031378, "balance_loss_clip": 1.01572561, "balance_loss_mlp": 1.02281451, "epoch": 0.352983616413648, "flos": 33108344835840.0, "grad_norm": 1.591893236940386, "language_loss": 0.71751237, "learning_rate": 2.8916159728998555e-06, "loss": 0.73857152, "num_input_tokens_seen": 126250230, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.515625, "step": 5871, "time_per_iteration": 3.878713369369507 }, { "auxiliary_loss_clip": 0.01071878, "auxiliary_loss_mlp": 0.01029749, "balance_loss_clip": 1.01571822, "balance_loss_mlp": 1.02270341, "epoch": 0.35304373966631597, "flos": 18695639681280.0, "grad_norm": 1.8012887879951427, "language_loss": 0.73728526, "learning_rate": 2.8912777574149642e-06, "loss": 0.75830156, "num_input_tokens_seen": 126268315, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.4921875, "step": 5872, "time_per_iteration": 2.3558521270751953 }, { "auxiliary_loss_clip": 0.01073731, "auxiliary_loss_mlp": 0.01032379, "balance_loss_clip": 1.01740646, "balance_loss_mlp": 1.02281487, "epoch": 0.35310386291898394, "flos": 23363911416960.0, "grad_norm": 1.6723231977884612, "language_loss": 0.82846761, "learning_rate": 2.8909395101226628e-06, "loss": 0.84952873, "num_input_tokens_seen": 126288390, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5078125, "step": 5873, "time_per_iteration": 2.396043539047241 }, { "auxiliary_loss_clip": 0.01079075, "auxiliary_loss_mlp": 0.0103179, "balance_loss_clip": 1.0153271, "balance_loss_mlp": 1.02438068, "epoch": 0.3531639861716519, "flos": 24460812512640.0, "grad_norm": 1.9535666503686573, "language_loss": 0.66170931, "learning_rate": 2.8906012310350212e-06, "loss": 0.68281794, "num_input_tokens_seen": 126305750, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.546875, "step": 5874, "time_per_iteration": 2.390150785446167 }, { "auxiliary_loss_clip": 0.01012705, "auxiliary_loss_mlp": 0.01000866, "balance_loss_clip": 0.99954325, "balance_loss_mlp": 1.002509, "epoch": 0.35322410942431987, "flos": 70309347955200.0, "grad_norm": 0.9091830235546952, "language_loss": 0.61591953, "learning_rate": 2.890262920164113e-06, "loss": 0.63605529, "num_input_tokens_seen": 126362495, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.1015625, "step": 5875, "time_per_iteration": 2.942614793777466 }, { "auxiliary_loss_clip": 0.0107746, "auxiliary_loss_mlp": 0.0103055, "balance_loss_clip": 1.01539278, "balance_loss_mlp": 1.02466393, "epoch": 0.35328423267698783, "flos": 19820087706240.0, "grad_norm": 1.8210745894631823, "language_loss": 0.7979157, "learning_rate": 2.8899245775220113e-06, "loss": 0.81899577, "num_input_tokens_seen": 126378320, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.52734375, "step": 5876, "time_per_iteration": 2.3559935092926025 }, { "auxiliary_loss_clip": 0.01012388, "auxiliary_loss_mlp": 0.01000726, "balance_loss_clip": 0.99947476, "balance_loss_mlp": 1.00215995, "epoch": 0.3533443559296558, "flos": 60823516043520.0, "grad_norm": 0.6723607187770596, "language_loss": 0.56811762, "learning_rate": 2.8895862031207906e-06, "loss": 0.58824879, "num_input_tokens_seen": 126442735, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.10253906, "step": 5877, "time_per_iteration": 3.133697748184204 }, { "auxiliary_loss_clip": 0.01076172, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.01378298, "balance_loss_mlp": 1.02378035, "epoch": 0.35340447918232376, "flos": 24754571625600.0, "grad_norm": 1.701678658638952, "language_loss": 0.719262, "learning_rate": 2.889247796972527e-06, "loss": 0.74032098, "num_input_tokens_seen": 126463090, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5234375, "step": 5878, "time_per_iteration": 2.4063687324523926 }, { "auxiliary_loss_clip": 0.01072906, "auxiliary_loss_mlp": 0.01034266, "balance_loss_clip": 1.01875675, "balance_loss_mlp": 1.02110076, "epoch": 0.3534646024349917, "flos": 21795298675200.0, "grad_norm": 1.580979400444761, "language_loss": 0.78321564, "learning_rate": 2.8889093590892965e-06, "loss": 0.80428731, "num_input_tokens_seen": 126482105, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.51953125, "step": 5879, "time_per_iteration": 2.4020001888275146 }, { "auxiliary_loss_clip": 0.01078892, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.01694214, "balance_loss_mlp": 1.0247345, "epoch": 0.3535247256876597, "flos": 20011062176640.0, "grad_norm": 2.101443437287178, "language_loss": 0.62793958, "learning_rate": 2.8885708894831776e-06, "loss": 0.64907402, "num_input_tokens_seen": 126502125, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.54296875, "step": 5880, "time_per_iteration": 2.384622812271118 }, { "auxiliary_loss_clip": 0.01074874, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.01529992, "balance_loss_mlp": 1.02300763, "epoch": 0.35358484894032766, "flos": 18186920127360.0, "grad_norm": 1.9008485433476485, "language_loss": 0.65353465, "learning_rate": 2.8882323881662496e-06, "loss": 0.67459929, "num_input_tokens_seen": 126521950, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.51953125, "step": 5881, "time_per_iteration": 2.394958734512329 }, { "auxiliary_loss_clip": 0.01072389, "auxiliary_loss_mlp": 0.01026406, "balance_loss_clip": 1.01245236, "balance_loss_mlp": 1.02308178, "epoch": 0.3536449721929956, "flos": 22819266207360.0, "grad_norm": 1.5645037558978367, "language_loss": 0.758187, "learning_rate": 2.887893855150592e-06, "loss": 0.77917492, "num_input_tokens_seen": 126542445, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.49414062, "step": 5882, "time_per_iteration": 2.385859966278076 }, { "auxiliary_loss_clip": 0.01077255, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 1.01841033, "balance_loss_mlp": 1.02386963, "epoch": 0.3537050954456636, "flos": 26431135891200.0, "grad_norm": 2.616980383547137, "language_loss": 0.70357871, "learning_rate": 2.8875552904482874e-06, "loss": 0.7246902, "num_input_tokens_seen": 126560690, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.53515625, "step": 5883, "time_per_iteration": 2.4271581172943115 }, { "auxiliary_loss_clip": 0.01079067, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.01709604, "balance_loss_mlp": 1.02432811, "epoch": 0.3537652186983316, "flos": 17196329721600.0, "grad_norm": 2.404685447682957, "language_loss": 0.78723383, "learning_rate": 2.8872166940714166e-06, "loss": 0.80835396, "num_input_tokens_seen": 126577620, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.546875, "step": 5884, "time_per_iteration": 2.345073699951172 }, { "auxiliary_loss_clip": 0.010778, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.01872206, "balance_loss_mlp": 1.02498007, "epoch": 0.3538253419509996, "flos": 19535754660480.0, "grad_norm": 1.9494007214503086, "language_loss": 0.75360185, "learning_rate": 2.886878066032065e-06, "loss": 0.77471679, "num_input_tokens_seen": 126596235, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.53125, "step": 5885, "time_per_iteration": 2.387208938598633 }, { "auxiliary_loss_clip": 0.01077986, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.01721382, "balance_loss_mlp": 1.02465034, "epoch": 0.35388546520366754, "flos": 12127813626240.0, "grad_norm": 2.3022689683142525, "language_loss": 0.83416253, "learning_rate": 2.8865394063423155e-06, "loss": 0.85527289, "num_input_tokens_seen": 126612830, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53515625, "step": 5886, "time_per_iteration": 2.3495311737060547 }, { "auxiliary_loss_clip": 0.01074471, "auxiliary_loss_mlp": 0.01030524, "balance_loss_clip": 1.0141561, "balance_loss_mlp": 1.02315307, "epoch": 0.3539455884563355, "flos": 19677257867520.0, "grad_norm": 2.039683036946036, "language_loss": 0.77759564, "learning_rate": 2.8862007150142557e-06, "loss": 0.79864556, "num_input_tokens_seen": 126630910, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.51171875, "step": 5887, "time_per_iteration": 2.376030921936035 }, { "auxiliary_loss_clip": 0.0107524, "auxiliary_loss_mlp": 0.0103677, "balance_loss_clip": 1.02040219, "balance_loss_mlp": 1.02309012, "epoch": 0.35400571170900347, "flos": 18071218281600.0, "grad_norm": 1.8087145519029344, "language_loss": 0.65876943, "learning_rate": 2.885861992059972e-06, "loss": 0.67988944, "num_input_tokens_seen": 126648365, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5234375, "step": 5888, "time_per_iteration": 2.3508224487304688 }, { "auxiliary_loss_clip": 0.01074859, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.01972389, "balance_loss_mlp": 1.02368033, "epoch": 0.35406583496167143, "flos": 26066852098560.0, "grad_norm": 2.565955302156471, "language_loss": 0.7759434, "learning_rate": 2.8855232374915528e-06, "loss": 0.79703766, "num_input_tokens_seen": 126667500, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.51171875, "step": 5889, "time_per_iteration": 2.4281210899353027 }, { "auxiliary_loss_clip": 0.01076147, "auxiliary_loss_mlp": 0.0103659, "balance_loss_clip": 1.02120578, "balance_loss_mlp": 1.0248611, "epoch": 0.3541259582143394, "flos": 19791423613440.0, "grad_norm": 1.6545030615509662, "language_loss": 0.80782312, "learning_rate": 2.885184451321087e-06, "loss": 0.82895052, "num_input_tokens_seen": 126686820, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.51171875, "step": 5890, "time_per_iteration": 2.372559070587158 }, { "auxiliary_loss_clip": 0.01071039, "auxiliary_loss_mlp": 0.01028752, "balance_loss_clip": 1.01510835, "balance_loss_mlp": 1.02188134, "epoch": 0.35418608146700736, "flos": 24021011715840.0, "grad_norm": 1.6483791044769702, "language_loss": 0.7966547, "learning_rate": 2.884845633560664e-06, "loss": 0.81765258, "num_input_tokens_seen": 126706965, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.4921875, "step": 5891, "time_per_iteration": 2.439776659011841 }, { "auxiliary_loss_clip": 0.01075726, "auxiliary_loss_mlp": 0.01034747, "balance_loss_clip": 1.01836753, "balance_loss_mlp": 1.02430964, "epoch": 0.35424620471967533, "flos": 12384948856320.0, "grad_norm": 1.7290875428508632, "language_loss": 0.72937632, "learning_rate": 2.8845067842223776e-06, "loss": 0.75048107, "num_input_tokens_seen": 126724015, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.515625, "step": 5892, "time_per_iteration": 2.3800930976867676 }, { "auxiliary_loss_clip": 0.01075378, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.02059865, "balance_loss_mlp": 1.02440941, "epoch": 0.3543063279723433, "flos": 19672859036160.0, "grad_norm": 2.1595526988730733, "language_loss": 0.6732682, "learning_rate": 2.884167903318319e-06, "loss": 0.69438702, "num_input_tokens_seen": 126737565, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5078125, "step": 5893, "time_per_iteration": 2.4048843383789062 }, { "auxiliary_loss_clip": 0.01073617, "auxiliary_loss_mlp": 0.01032349, "balance_loss_clip": 1.01644659, "balance_loss_mlp": 1.02289867, "epoch": 0.35436645122501126, "flos": 21908102878080.0, "grad_norm": 1.7349576200799974, "language_loss": 0.6976167, "learning_rate": 2.8838289908605822e-06, "loss": 0.71867639, "num_input_tokens_seen": 126756095, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5078125, "step": 5894, "time_per_iteration": 2.385986804962158 }, { "auxiliary_loss_clip": 0.0107549, "auxiliary_loss_mlp": 0.01027314, "balance_loss_clip": 1.01326466, "balance_loss_mlp": 1.02557349, "epoch": 0.3544265744776792, "flos": 21718629596160.0, "grad_norm": 2.645961968340466, "language_loss": 0.74912483, "learning_rate": 2.8834900468612624e-06, "loss": 0.77015287, "num_input_tokens_seen": 126775455, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.5, "step": 5895, "time_per_iteration": 2.4591476917266846 }, { "auxiliary_loss_clip": 0.01074106, "auxiliary_loss_mlp": 0.01031558, "balance_loss_clip": 1.01627517, "balance_loss_mlp": 1.02289915, "epoch": 0.3544866977303472, "flos": 21212214192000.0, "grad_norm": 1.9440931471312937, "language_loss": 0.8345443, "learning_rate": 2.883151071332455e-06, "loss": 0.85560095, "num_input_tokens_seen": 126792320, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.51171875, "step": 5896, "time_per_iteration": 2.3896162509918213 }, { "auxiliary_loss_clip": 0.01075096, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.02389169, "balance_loss_mlp": 1.0239203, "epoch": 0.35454682098301515, "flos": 29310213715200.0, "grad_norm": 1.6298269937323695, "language_loss": 0.69902974, "learning_rate": 2.8828120642862585e-06, "loss": 0.72018236, "num_input_tokens_seen": 126813680, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.51171875, "step": 5897, "time_per_iteration": 2.477522134780884 }, { "auxiliary_loss_clip": 0.01074205, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.01820421, "balance_loss_mlp": 1.02349758, "epoch": 0.3546069442356832, "flos": 24315434144640.0, "grad_norm": 1.499286734717137, "language_loss": 0.81830782, "learning_rate": 2.882473025734769e-06, "loss": 0.83938587, "num_input_tokens_seen": 126834395, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5078125, "step": 5898, "time_per_iteration": 2.408853769302368 }, { "auxiliary_loss_clip": 0.0107103, "auxiliary_loss_mlp": 0.01032759, "balance_loss_clip": 1.01895452, "balance_loss_mlp": 1.02268291, "epoch": 0.35466706748835114, "flos": 22856169381120.0, "grad_norm": 1.4404394282424455, "language_loss": 0.74296194, "learning_rate": 2.8821339556900883e-06, "loss": 0.76399988, "num_input_tokens_seen": 126855145, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48242188, "step": 5899, "time_per_iteration": 2.4350152015686035 }, { "auxiliary_loss_clip": 0.01074276, "auxiliary_loss_mlp": 0.01030871, "balance_loss_clip": 1.01613092, "balance_loss_mlp": 1.02306354, "epoch": 0.3547271907410191, "flos": 28328839908480.0, "grad_norm": 2.139747911416434, "language_loss": 0.79579532, "learning_rate": 2.8817948541643153e-06, "loss": 0.81684673, "num_input_tokens_seen": 126873790, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.51171875, "step": 5900, "time_per_iteration": 2.4274556636810303 }, { "auxiliary_loss_clip": 0.01075679, "auxiliary_loss_mlp": 0.01031729, "balance_loss_clip": 1.01562381, "balance_loss_mlp": 1.02413762, "epoch": 0.35478731399368707, "flos": 23512955477760.0, "grad_norm": 1.8622299784497105, "language_loss": 0.81282228, "learning_rate": 2.8814557211695523e-06, "loss": 0.8338964, "num_input_tokens_seen": 126892865, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.515625, "step": 5901, "time_per_iteration": 2.447465658187866 }, { "auxiliary_loss_clip": 0.01075801, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.01333487, "balance_loss_mlp": 1.02326739, "epoch": 0.35484743724635504, "flos": 18623334522240.0, "grad_norm": 1.7799090708897565, "language_loss": 0.757442, "learning_rate": 2.8811165567179025e-06, "loss": 0.7784878, "num_input_tokens_seen": 126911935, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5234375, "step": 5902, "time_per_iteration": 2.3875577449798584 }, { "auxiliary_loss_clip": 0.01074364, "auxiliary_loss_mlp": 0.01032203, "balance_loss_clip": 1.01726556, "balance_loss_mlp": 1.02345991, "epoch": 0.354907560499023, "flos": 17383533765120.0, "grad_norm": 3.485506888533945, "language_loss": 0.70700645, "learning_rate": 2.880777360821468e-06, "loss": 0.72807217, "num_input_tokens_seen": 126930040, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5078125, "step": 5903, "time_per_iteration": 3.7309510707855225 }, { "auxiliary_loss_clip": 0.01076166, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.01654553, "balance_loss_mlp": 1.02330661, "epoch": 0.35496768375169097, "flos": 19207536168960.0, "grad_norm": 2.8247905819090935, "language_loss": 0.74162674, "learning_rate": 2.8804381334923563e-06, "loss": 0.76271129, "num_input_tokens_seen": 126948390, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.52734375, "step": 5904, "time_per_iteration": 2.3524158000946045 }, { "auxiliary_loss_clip": 0.01077683, "auxiliary_loss_mlp": 0.01030771, "balance_loss_clip": 1.01529133, "balance_loss_mlp": 1.02583325, "epoch": 0.35502780700435893, "flos": 18331809736320.0, "grad_norm": 8.969292883330912, "language_loss": 0.79157579, "learning_rate": 2.8800988747426722e-06, "loss": 0.81266034, "num_input_tokens_seen": 126964905, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.51953125, "step": 5905, "time_per_iteration": 2.3464064598083496 }, { "auxiliary_loss_clip": 0.01067803, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 1.01639962, "balance_loss_mlp": 1.02163506, "epoch": 0.3550879302570269, "flos": 15447704676480.0, "grad_norm": 1.8276164085040287, "language_loss": 0.72286129, "learning_rate": 2.8797595845845225e-06, "loss": 0.74383855, "num_input_tokens_seen": 126982000, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.4609375, "step": 5906, "time_per_iteration": 3.7140448093414307 }, { "auxiliary_loss_clip": 0.01077125, "auxiliary_loss_mlp": 0.01028239, "balance_loss_clip": 1.01140714, "balance_loss_mlp": 1.02378869, "epoch": 0.35514805350969486, "flos": 21978173710080.0, "grad_norm": 1.9258495636842574, "language_loss": 0.74568594, "learning_rate": 2.879420263030017e-06, "loss": 0.76673961, "num_input_tokens_seen": 126998390, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.53515625, "step": 5907, "time_per_iteration": 3.828256607055664 }, { "auxiliary_loss_clip": 0.01074815, "auxiliary_loss_mlp": 0.01029877, "balance_loss_clip": 1.01446307, "balance_loss_mlp": 1.02332067, "epoch": 0.3552081767623628, "flos": 29860654210560.0, "grad_norm": 1.6433169390756894, "language_loss": 0.75686789, "learning_rate": 2.8790809100912637e-06, "loss": 0.77791482, "num_input_tokens_seen": 127020220, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.515625, "step": 5908, "time_per_iteration": 2.4670584201812744 }, { "auxiliary_loss_clip": 0.01075054, "auxiliary_loss_mlp": 0.01028911, "balance_loss_clip": 1.01433158, "balance_loss_mlp": 1.02412367, "epoch": 0.3552683000150308, "flos": 26431066068480.0, "grad_norm": 1.8866977599410624, "language_loss": 0.68300748, "learning_rate": 2.8787415257803742e-06, "loss": 0.7040472, "num_input_tokens_seen": 127038585, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.5078125, "step": 5909, "time_per_iteration": 2.509817123413086 }, { "auxiliary_loss_clip": 0.01071519, "auxiliary_loss_mlp": 0.01028497, "balance_loss_clip": 1.01430488, "balance_loss_mlp": 1.02333391, "epoch": 0.35532842326769876, "flos": 19785139568640.0, "grad_norm": 1.7310796374024051, "language_loss": 0.78199911, "learning_rate": 2.8784021101094605e-06, "loss": 0.80299926, "num_input_tokens_seen": 127056215, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48242188, "step": 5910, "time_per_iteration": 2.426288604736328 }, { "auxiliary_loss_clip": 0.01076215, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.01181817, "balance_loss_mlp": 1.02370858, "epoch": 0.3553885465203668, "flos": 17238295042560.0, "grad_norm": 1.7470070007300544, "language_loss": 0.71116287, "learning_rate": 2.878062663090635e-06, "loss": 0.73220789, "num_input_tokens_seen": 127075825, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.5234375, "step": 5911, "time_per_iteration": 3.816206693649292 }, { "auxiliary_loss_clip": 0.01071634, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.01283526, "balance_loss_mlp": 1.02319586, "epoch": 0.35544866977303474, "flos": 14933608773120.0, "grad_norm": 2.4276127251523456, "language_loss": 0.86980754, "learning_rate": 2.8777231847360117e-06, "loss": 0.89079118, "num_input_tokens_seen": 127091205, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.484375, "step": 5912, "time_per_iteration": 2.385530710220337 }, { "auxiliary_loss_clip": 0.01072557, "auxiliary_loss_mlp": 0.01026646, "balance_loss_clip": 1.01207829, "balance_loss_mlp": 1.0229423, "epoch": 0.3555087930257027, "flos": 19755009198720.0, "grad_norm": 2.004730984511101, "language_loss": 0.76809984, "learning_rate": 2.8773836750577053e-06, "loss": 0.78909194, "num_input_tokens_seen": 127109210, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.49609375, "step": 5913, "time_per_iteration": 2.398463726043701 }, { "auxiliary_loss_clip": 0.01072708, "auxiliary_loss_mlp": 0.01028703, "balance_loss_clip": 1.01374865, "balance_loss_mlp": 1.02411163, "epoch": 0.3555689162783707, "flos": 21067219848960.0, "grad_norm": 1.2736134280555174, "language_loss": 0.82607269, "learning_rate": 2.877044134067833e-06, "loss": 0.84708679, "num_input_tokens_seen": 127128400, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.484375, "step": 5914, "time_per_iteration": 2.3949437141418457 }, { "auxiliary_loss_clip": 0.01072856, "auxiliary_loss_mlp": 0.01027922, "balance_loss_clip": 1.01286614, "balance_loss_mlp": 1.02342296, "epoch": 0.35562903953103864, "flos": 33068334551040.0, "grad_norm": 2.0536885243897727, "language_loss": 0.70349467, "learning_rate": 2.8767045617785108e-06, "loss": 0.72450244, "num_input_tokens_seen": 127149965, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49609375, "step": 5915, "time_per_iteration": 2.499976396560669 }, { "auxiliary_loss_clip": 0.01070764, "auxiliary_loss_mlp": 0.01030656, "balance_loss_clip": 1.01661289, "balance_loss_mlp": 1.02145088, "epoch": 0.3556891627837066, "flos": 20556824549760.0, "grad_norm": 1.7392553373605206, "language_loss": 0.76000738, "learning_rate": 2.8763649582018584e-06, "loss": 0.78102154, "num_input_tokens_seen": 127169865, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49414062, "step": 5916, "time_per_iteration": 2.4060258865356445 }, { "auxiliary_loss_clip": 0.01075412, "auxiliary_loss_mlp": 0.01033269, "balance_loss_clip": 1.01874971, "balance_loss_mlp": 1.02454448, "epoch": 0.35574928603637457, "flos": 20702307651840.0, "grad_norm": 1.5779878568317227, "language_loss": 0.88140929, "learning_rate": 2.876025323349995e-06, "loss": 0.90249616, "num_input_tokens_seen": 127188075, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.5078125, "step": 5917, "time_per_iteration": 2.3921902179718018 }, { "auxiliary_loss_clip": 0.01072951, "auxiliary_loss_mlp": 0.01025504, "balance_loss_clip": 1.01175272, "balance_loss_mlp": 1.02395844, "epoch": 0.35580940928904253, "flos": 15193711468800.0, "grad_norm": 1.9180134136793532, "language_loss": 0.74765903, "learning_rate": 2.875685657235041e-06, "loss": 0.76864356, "num_input_tokens_seen": 127206065, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48828125, "step": 5918, "time_per_iteration": 2.412428379058838 }, { "auxiliary_loss_clip": 0.01012443, "auxiliary_loss_mlp": 0.0100438, "balance_loss_clip": 1.00324166, "balance_loss_mlp": 1.00213659, "epoch": 0.3558695325417105, "flos": 58636312099200.0, "grad_norm": 0.9166926561251532, "language_loss": 0.63805127, "learning_rate": 2.8753459598691183e-06, "loss": 0.65821946, "num_input_tokens_seen": 127257885, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.10302734, "step": 5919, "time_per_iteration": 2.80796217918396 }, { "auxiliary_loss_clip": 0.01075172, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.01707435, "balance_loss_mlp": 1.02332199, "epoch": 0.35592965579437846, "flos": 22017136654080.0, "grad_norm": 2.2192382856850954, "language_loss": 0.73782456, "learning_rate": 2.8750062312643495e-06, "loss": 0.75889635, "num_input_tokens_seen": 127275550, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.51953125, "step": 5920, "time_per_iteration": 2.43385910987854 }, { "auxiliary_loss_clip": 0.01070828, "auxiliary_loss_mlp": 0.01027108, "balance_loss_clip": 1.01217747, "balance_loss_mlp": 1.02149773, "epoch": 0.35598977904704643, "flos": 23366564680320.0, "grad_norm": 1.7025904234366431, "language_loss": 0.7757051, "learning_rate": 2.8746664714328603e-06, "loss": 0.7966845, "num_input_tokens_seen": 127295110, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.4921875, "step": 5921, "time_per_iteration": 2.422354221343994 }, { "auxiliary_loss_clip": 0.01072123, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 1.01437283, "balance_loss_mlp": 1.02315259, "epoch": 0.3560499022997144, "flos": 17784371617920.0, "grad_norm": 2.1851720961550476, "language_loss": 0.67276013, "learning_rate": 2.8743266803867743e-06, "loss": 0.69376391, "num_input_tokens_seen": 127312865, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.49023438, "step": 5922, "time_per_iteration": 2.4101037979125977 }, { "auxiliary_loss_clip": 0.0107533, "auxiliary_loss_mlp": 0.01034294, "balance_loss_clip": 1.01991677, "balance_loss_mlp": 1.02396476, "epoch": 0.35611002555238236, "flos": 20739420293760.0, "grad_norm": 1.9506992526186122, "language_loss": 0.78942466, "learning_rate": 2.8739868581382175e-06, "loss": 0.81052095, "num_input_tokens_seen": 127331710, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.515625, "step": 5923, "time_per_iteration": 2.4214835166931152 }, { "auxiliary_loss_clip": 0.01075011, "auxiliary_loss_mlp": 0.0103228, "balance_loss_clip": 1.01836824, "balance_loss_mlp": 1.02488649, "epoch": 0.3561701488050504, "flos": 19461250085760.0, "grad_norm": 1.8827073097389189, "language_loss": 0.85266215, "learning_rate": 2.873647004699318e-06, "loss": 0.87373507, "num_input_tokens_seen": 127350950, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.5, "step": 5924, "time_per_iteration": 2.3856821060180664 }, { "auxiliary_loss_clip": 0.01072474, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.01472116, "balance_loss_mlp": 1.02325845, "epoch": 0.35623027205771834, "flos": 30773598019200.0, "grad_norm": 1.8864423090769173, "language_loss": 0.77786255, "learning_rate": 2.8733071200822046e-06, "loss": 0.79888129, "num_input_tokens_seen": 127369385, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.4921875, "step": 5925, "time_per_iteration": 2.446983814239502 }, { "auxiliary_loss_clip": 0.01070506, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.01706064, "balance_loss_mlp": 1.02089548, "epoch": 0.3562903953103863, "flos": 16980182294400.0, "grad_norm": 1.9084098048431093, "language_loss": 0.75571799, "learning_rate": 2.8729672042990068e-06, "loss": 0.77673841, "num_input_tokens_seen": 127386965, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49609375, "step": 5926, "time_per_iteration": 2.363508462905884 }, { "auxiliary_loss_clip": 0.01075721, "auxiliary_loss_mlp": 0.01027054, "balance_loss_clip": 1.01245046, "balance_loss_mlp": 1.02421558, "epoch": 0.3563505185630543, "flos": 23838765085440.0, "grad_norm": 2.025626013025771, "language_loss": 0.69512618, "learning_rate": 2.872627257361855e-06, "loss": 0.71615392, "num_input_tokens_seen": 127406075, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.515625, "step": 5927, "time_per_iteration": 2.383791923522949 }, { "auxiliary_loss_clip": 0.01069768, "auxiliary_loss_mlp": 0.01028729, "balance_loss_clip": 1.01556206, "balance_loss_mlp": 1.02203751, "epoch": 0.35641064181572224, "flos": 22272351759360.0, "grad_norm": 1.977131925250172, "language_loss": 0.79609823, "learning_rate": 2.8722872792828803e-06, "loss": 0.81708324, "num_input_tokens_seen": 127425350, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.47851562, "step": 5928, "time_per_iteration": 2.395374298095703 }, { "auxiliary_loss_clip": 0.01070851, "auxiliary_loss_mlp": 0.010285, "balance_loss_clip": 1.01433802, "balance_loss_mlp": 1.02258492, "epoch": 0.3564707650683902, "flos": 23000186206080.0, "grad_norm": 1.36971100217665, "language_loss": 0.81821471, "learning_rate": 2.8719472700742167e-06, "loss": 0.83920825, "num_input_tokens_seen": 127446335, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48242188, "step": 5929, "time_per_iteration": 2.419863700866699 }, { "auxiliary_loss_clip": 0.01068277, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 1.01179874, "balance_loss_mlp": 1.02052593, "epoch": 0.35653088832105817, "flos": 14683385992320.0, "grad_norm": 1.596973328497599, "language_loss": 0.70001251, "learning_rate": 2.871607229747998e-06, "loss": 0.72094524, "num_input_tokens_seen": 127462795, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.4765625, "step": 5930, "time_per_iteration": 2.342053174972534 }, { "auxiliary_loss_clip": 0.01075997, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 1.015064, "balance_loss_mlp": 1.02533948, "epoch": 0.35659101157372614, "flos": 23475947569920.0, "grad_norm": 1.8892181752957757, "language_loss": 0.67771393, "learning_rate": 2.8712671583163596e-06, "loss": 0.69876921, "num_input_tokens_seen": 127482675, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5078125, "step": 5931, "time_per_iteration": 2.424328088760376 }, { "auxiliary_loss_clip": 0.01070913, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.01791549, "balance_loss_mlp": 1.0222764, "epoch": 0.3566511348263941, "flos": 26577456865920.0, "grad_norm": 1.715579462192903, "language_loss": 0.6755209, "learning_rate": 2.870927055791437e-06, "loss": 0.6965493, "num_input_tokens_seen": 127502275, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.48828125, "step": 5932, "time_per_iteration": 2.4113881587982178 }, { "auxiliary_loss_clip": 0.01069909, "auxiliary_loss_mlp": 0.01024306, "balance_loss_clip": 1.01120543, "balance_loss_mlp": 1.02280855, "epoch": 0.35671125807906207, "flos": 13114179757440.0, "grad_norm": 2.152898948611348, "language_loss": 0.78931725, "learning_rate": 2.8705869221853684e-06, "loss": 0.81025946, "num_input_tokens_seen": 127520195, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.47070312, "step": 5933, "time_per_iteration": 2.379514217376709 }, { "auxiliary_loss_clip": 0.01070714, "auxiliary_loss_mlp": 0.01035387, "balance_loss_clip": 1.02138042, "balance_loss_mlp": 1.02115858, "epoch": 0.35677138133173003, "flos": 32999171414400.0, "grad_norm": 1.481716828316316, "language_loss": 0.69572234, "learning_rate": 2.8702467575102914e-06, "loss": 0.71678329, "num_input_tokens_seen": 127544495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49609375, "step": 5934, "time_per_iteration": 2.4865353107452393 }, { "auxiliary_loss_clip": 0.0107718, "auxiliary_loss_mlp": 0.01036748, "balance_loss_clip": 1.0198679, "balance_loss_mlp": 1.02353334, "epoch": 0.356831504584398, "flos": 20776777315200.0, "grad_norm": 1.6308150932066683, "language_loss": 0.70757735, "learning_rate": 2.869906561778347e-06, "loss": 0.72871661, "num_input_tokens_seen": 127563810, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5390625, "step": 5935, "time_per_iteration": 2.417588472366333 }, { "auxiliary_loss_clip": 0.01071965, "auxiliary_loss_mlp": 0.0102967, "balance_loss_clip": 1.01504302, "balance_loss_mlp": 1.02198422, "epoch": 0.35689162783706596, "flos": 12164786622720.0, "grad_norm": 2.6017334537563785, "language_loss": 0.78478062, "learning_rate": 2.869566335001674e-06, "loss": 0.80579704, "num_input_tokens_seen": 127579065, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5, "step": 5936, "time_per_iteration": 2.3403899669647217 }, { "auxiliary_loss_clip": 0.01070823, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.01793408, "balance_loss_mlp": 1.0215863, "epoch": 0.356951751089734, "flos": 23840371008000.0, "grad_norm": 1.360430489831818, "language_loss": 0.64434779, "learning_rate": 2.8692260771924167e-06, "loss": 0.66538608, "num_input_tokens_seen": 127599105, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.4921875, "step": 5937, "time_per_iteration": 2.4048120975494385 }, { "auxiliary_loss_clip": 0.01074406, "auxiliary_loss_mlp": 0.01026844, "balance_loss_clip": 1.01247382, "balance_loss_mlp": 1.02333021, "epoch": 0.35701187434240195, "flos": 11721564512640.0, "grad_norm": 2.338908034518505, "language_loss": 0.7841239, "learning_rate": 2.868885788362715e-06, "loss": 0.80513638, "num_input_tokens_seen": 127614940, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.5078125, "step": 5938, "time_per_iteration": 2.3586747646331787 }, { "auxiliary_loss_clip": 0.01073713, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.01928055, "balance_loss_mlp": 1.02315617, "epoch": 0.3570719975950699, "flos": 24897750577920.0, "grad_norm": 1.504809101025306, "language_loss": 0.8034789, "learning_rate": 2.868545468524716e-06, "loss": 0.82455516, "num_input_tokens_seen": 127634960, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.50390625, "step": 5939, "time_per_iteration": 2.4168505668640137 }, { "auxiliary_loss_clip": 0.01073777, "auxiliary_loss_mlp": 0.01028514, "balance_loss_clip": 1.01325536, "balance_loss_mlp": 1.02135611, "epoch": 0.3571321208477379, "flos": 25993639244160.0, "grad_norm": 1.79766284266332, "language_loss": 0.79158193, "learning_rate": 2.8682051176905624e-06, "loss": 0.81260484, "num_input_tokens_seen": 127654545, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5234375, "step": 5940, "time_per_iteration": 2.4209342002868652 }, { "auxiliary_loss_clip": 0.01073551, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.01330376, "balance_loss_mlp": 1.02250683, "epoch": 0.35719224410040584, "flos": 14500790248320.0, "grad_norm": 1.8942428759598329, "language_loss": 0.71959144, "learning_rate": 2.867864735872402e-06, "loss": 0.7406159, "num_input_tokens_seen": 127672320, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5078125, "step": 5941, "time_per_iteration": 2.4017865657806396 }, { "auxiliary_loss_clip": 0.01074913, "auxiliary_loss_mlp": 0.01031032, "balance_loss_clip": 1.01592779, "balance_loss_mlp": 1.02432132, "epoch": 0.3572523673530738, "flos": 31174121669760.0, "grad_norm": 2.034368847560607, "language_loss": 0.63886237, "learning_rate": 2.8675243230823815e-06, "loss": 0.65992182, "num_input_tokens_seen": 127693315, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 5942, "time_per_iteration": 4.0378193855285645 }, { "auxiliary_loss_clip": 0.01071789, "auxiliary_loss_mlp": 0.01036464, "balance_loss_clip": 1.02050829, "balance_loss_mlp": 1.02242017, "epoch": 0.3573124906057418, "flos": 15851056147200.0, "grad_norm": 1.8992011189116973, "language_loss": 0.73817796, "learning_rate": 2.86718387933265e-06, "loss": 0.75926054, "num_input_tokens_seen": 127711570, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.49414062, "step": 5943, "time_per_iteration": 2.3711822032928467 }, { "auxiliary_loss_clip": 0.01013196, "auxiliary_loss_mlp": 0.01000807, "balance_loss_clip": 0.99947822, "balance_loss_mlp": 1.00293803, "epoch": 0.35737261385840974, "flos": 60819989996160.0, "grad_norm": 0.8011237422544815, "language_loss": 0.6077981, "learning_rate": 2.8668434046353557e-06, "loss": 0.62793815, "num_input_tokens_seen": 127772475, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.10253906, "step": 5944, "time_per_iteration": 3.124643087387085 }, { "auxiliary_loss_clip": 0.01069537, "auxiliary_loss_mlp": 0.01025334, "balance_loss_clip": 1.01138711, "balance_loss_mlp": 1.02124238, "epoch": 0.3574327371110777, "flos": 18842763617280.0, "grad_norm": 1.719653871564029, "language_loss": 0.72688848, "learning_rate": 2.86650289900265e-06, "loss": 0.74783719, "num_input_tokens_seen": 127790940, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.48242188, "step": 5945, "time_per_iteration": 3.7560200691223145 }, { "auxiliary_loss_clip": 0.01071227, "auxiliary_loss_mlp": 0.01030704, "balance_loss_clip": 1.01599336, "balance_loss_mlp": 1.02127099, "epoch": 0.35749286036374567, "flos": 23548566931200.0, "grad_norm": 1.6907981232767915, "language_loss": 0.80688787, "learning_rate": 2.8661623624466856e-06, "loss": 0.8279072, "num_input_tokens_seen": 127808275, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5, "step": 5946, "time_per_iteration": 3.8214290142059326 }, { "auxiliary_loss_clip": 0.01074525, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.0230881, "balance_loss_mlp": 1.0244472, "epoch": 0.35755298361641363, "flos": 21104437224960.0, "grad_norm": 1.3280999474170685, "language_loss": 0.68914711, "learning_rate": 2.8658217949796133e-06, "loss": 0.71027148, "num_input_tokens_seen": 127828840, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5, "step": 5947, "time_per_iteration": 2.4186766147613525 }, { "auxiliary_loss_clip": 0.01069792, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.02277255, "balance_loss_mlp": 1.02240419, "epoch": 0.3576131068690816, "flos": 19244020406400.0, "grad_norm": 1.6894771416948158, "language_loss": 0.7563026, "learning_rate": 2.8654811966135893e-06, "loss": 0.77736843, "num_input_tokens_seen": 127846240, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.47460938, "step": 5948, "time_per_iteration": 2.3672938346862793 }, { "auxiliary_loss_clip": 0.01069503, "auxiliary_loss_mlp": 0.01033781, "balance_loss_clip": 1.01949406, "balance_loss_mlp": 1.02019787, "epoch": 0.35767323012174956, "flos": 28653532352640.0, "grad_norm": 5.932336696304652, "language_loss": 0.70936704, "learning_rate": 2.865140567360767e-06, "loss": 0.73039985, "num_input_tokens_seen": 127866880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.4921875, "step": 5949, "time_per_iteration": 2.471717596054077 }, { "auxiliary_loss_clip": 0.01072815, "auxiliary_loss_mlp": 0.01035241, "balance_loss_clip": 1.02138329, "balance_loss_mlp": 1.02277851, "epoch": 0.35773335337441753, "flos": 17084607770880.0, "grad_norm": 1.8392819708553219, "language_loss": 0.77234703, "learning_rate": 2.864799907233304e-06, "loss": 0.79342759, "num_input_tokens_seen": 127883560, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.5, "step": 5950, "time_per_iteration": 3.8080577850341797 }, { "auxiliary_loss_clip": 0.01072638, "auxiliary_loss_mlp": 0.01028006, "balance_loss_clip": 1.01333153, "balance_loss_mlp": 1.02253437, "epoch": 0.35779347662708555, "flos": 15887680030080.0, "grad_norm": 1.6855518394982005, "language_loss": 0.73074478, "learning_rate": 2.8644592162433565e-06, "loss": 0.75175124, "num_input_tokens_seen": 127902330, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5, "step": 5951, "time_per_iteration": 2.385679006576538 }, { "auxiliary_loss_clip": 0.01075451, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.01308489, "balance_loss_mlp": 1.02243567, "epoch": 0.3578535998797535, "flos": 28657547159040.0, "grad_norm": 2.0627297901646666, "language_loss": 0.70312607, "learning_rate": 2.864118494403083e-06, "loss": 0.72416627, "num_input_tokens_seen": 127922325, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.53125, "step": 5952, "time_per_iteration": 2.4737606048583984 }, { "auxiliary_loss_clip": 0.01070435, "auxiliary_loss_mlp": 0.01028892, "balance_loss_clip": 1.01501608, "balance_loss_mlp": 1.02144802, "epoch": 0.3579137231324215, "flos": 37850911678080.0, "grad_norm": 1.7315607420422052, "language_loss": 0.6982621, "learning_rate": 2.863777741724643e-06, "loss": 0.71925539, "num_input_tokens_seen": 127942635, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.48828125, "step": 5953, "time_per_iteration": 2.5504329204559326 }, { "auxiliary_loss_clip": 0.01070049, "auxiliary_loss_mlp": 0.01027952, "balance_loss_clip": 1.01369417, "balance_loss_mlp": 1.02118003, "epoch": 0.35797384638508944, "flos": 22345739170560.0, "grad_norm": 1.522911635126259, "language_loss": 0.66758895, "learning_rate": 2.863436958220198e-06, "loss": 0.68856895, "num_input_tokens_seen": 127962520, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.48828125, "step": 5954, "time_per_iteration": 2.4199447631835938 }, { "auxiliary_loss_clip": 0.0107244, "auxiliary_loss_mlp": 0.01030019, "balance_loss_clip": 1.01595175, "balance_loss_mlp": 1.02268529, "epoch": 0.3580339696377574, "flos": 13588858869120.0, "grad_norm": 1.9268731718788137, "language_loss": 0.74515939, "learning_rate": 2.8630961439019087e-06, "loss": 0.76618397, "num_input_tokens_seen": 127981180, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49609375, "step": 5955, "time_per_iteration": 2.4272725582122803 }, { "auxiliary_loss_clip": 0.01068141, "auxiliary_loss_mlp": 0.01022692, "balance_loss_clip": 1.00941253, "balance_loss_mlp": 1.02124548, "epoch": 0.3580940928904254, "flos": 23767123242240.0, "grad_norm": 1.6735730058771812, "language_loss": 0.76419848, "learning_rate": 2.8627552987819382e-06, "loss": 0.78510684, "num_input_tokens_seen": 127999725, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.46875, "step": 5956, "time_per_iteration": 2.405097484588623 }, { "auxiliary_loss_clip": 0.01069308, "auxiliary_loss_mlp": 0.01026953, "balance_loss_clip": 1.01375628, "balance_loss_mlp": 1.02230978, "epoch": 0.35815421614309334, "flos": 19462856008320.0, "grad_norm": 1.5454168655842875, "language_loss": 0.73009193, "learning_rate": 2.86241442287245e-06, "loss": 0.75105453, "num_input_tokens_seen": 128018885, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.47070312, "step": 5957, "time_per_iteration": 2.3796825408935547 }, { "auxiliary_loss_clip": 0.01071461, "auxiliary_loss_mlp": 0.0102929, "balance_loss_clip": 1.0146687, "balance_loss_mlp": 1.02231061, "epoch": 0.3582143393957613, "flos": 23367053439360.0, "grad_norm": 1.7028257216885643, "language_loss": 0.70873713, "learning_rate": 2.86207351618561e-06, "loss": 0.72974467, "num_input_tokens_seen": 128037875, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.4921875, "step": 5958, "time_per_iteration": 2.3916237354278564 }, { "auxiliary_loss_clip": 0.01069399, "auxiliary_loss_mlp": 0.01026143, "balance_loss_clip": 1.01275599, "balance_loss_mlp": 1.02178752, "epoch": 0.35827446264842927, "flos": 26322067203840.0, "grad_norm": 1.5966503158089345, "language_loss": 0.8835175, "learning_rate": 2.8617325787335833e-06, "loss": 0.90447289, "num_input_tokens_seen": 128056045, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.4765625, "step": 5959, "time_per_iteration": 2.4356231689453125 }, { "auxiliary_loss_clip": 0.01070174, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.01886547, "balance_loss_mlp": 1.02187371, "epoch": 0.35833458590109724, "flos": 30445274793600.0, "grad_norm": 1.659003537861445, "language_loss": 0.58252156, "learning_rate": 2.861391610528538e-06, "loss": 0.60355556, "num_input_tokens_seen": 128077815, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.48242188, "step": 5960, "time_per_iteration": 2.448814630508423 }, { "auxiliary_loss_clip": 0.01071052, "auxiliary_loss_mlp": 0.01025945, "balance_loss_clip": 1.01110339, "balance_loss_mlp": 1.02143323, "epoch": 0.3583947091537652, "flos": 14829008739840.0, "grad_norm": 2.03019221311439, "language_loss": 0.76461655, "learning_rate": 2.8610506115826415e-06, "loss": 0.78558648, "num_input_tokens_seen": 128095460, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49414062, "step": 5961, "time_per_iteration": 2.3817083835601807 }, { "auxiliary_loss_clip": 0.01072396, "auxiliary_loss_mlp": 0.01026293, "balance_loss_clip": 1.01171958, "balance_loss_mlp": 1.02307522, "epoch": 0.35845483240643317, "flos": 34239216551040.0, "grad_norm": 1.6862321619024208, "language_loss": 0.70352829, "learning_rate": 2.8607095819080633e-06, "loss": 0.7245152, "num_input_tokens_seen": 128118605, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.49414062, "step": 5962, "time_per_iteration": 2.4933602809906006 }, { "auxiliary_loss_clip": 0.01070173, "auxiliary_loss_mlp": 0.01028243, "balance_loss_clip": 1.01541662, "balance_loss_mlp": 1.02336693, "epoch": 0.35851495565910113, "flos": 20959023945600.0, "grad_norm": 1.6460268097452655, "language_loss": 0.74522746, "learning_rate": 2.8603685215169745e-06, "loss": 0.76621163, "num_input_tokens_seen": 128139205, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.46875, "step": 5963, "time_per_iteration": 2.4234609603881836 }, { "auxiliary_loss_clip": 0.01070613, "auxiliary_loss_mlp": 0.01029882, "balance_loss_clip": 1.01486123, "balance_loss_mlp": 1.0227766, "epoch": 0.35857507891176915, "flos": 22308766174080.0, "grad_norm": 1.5310674884993207, "language_loss": 0.78604966, "learning_rate": 2.8600274304215458e-06, "loss": 0.80705464, "num_input_tokens_seen": 128158765, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.47851562, "step": 5964, "time_per_iteration": 2.41050124168396 }, { "auxiliary_loss_clip": 0.01072615, "auxiliary_loss_mlp": 0.01027577, "balance_loss_clip": 1.01299191, "balance_loss_mlp": 1.02206707, "epoch": 0.3586352021644371, "flos": 23366739237120.0, "grad_norm": 2.025921451943204, "language_loss": 0.6644938, "learning_rate": 2.859686308633951e-06, "loss": 0.68549573, "num_input_tokens_seen": 128177850, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.50390625, "step": 5965, "time_per_iteration": 2.408123254776001 }, { "auxiliary_loss_clip": 0.01072317, "auxiliary_loss_mlp": 0.01026799, "balance_loss_clip": 1.01197517, "balance_loss_mlp": 1.02330947, "epoch": 0.3586953254171051, "flos": 27848156042880.0, "grad_norm": 1.5425050498290698, "language_loss": 0.79170668, "learning_rate": 2.8593451561663634e-06, "loss": 0.81269795, "num_input_tokens_seen": 128196925, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49023438, "step": 5966, "time_per_iteration": 2.4352409839630127 }, { "auxiliary_loss_clip": 0.01070282, "auxiliary_loss_mlp": 0.01029939, "balance_loss_clip": 1.01453137, "balance_loss_mlp": 1.02103496, "epoch": 0.35875544866977305, "flos": 19499479891200.0, "grad_norm": 1.9073311257023076, "language_loss": 0.91068411, "learning_rate": 2.859003973030957e-06, "loss": 0.93168628, "num_input_tokens_seen": 128213955, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.4921875, "step": 5967, "time_per_iteration": 2.383220672607422 }, { "auxiliary_loss_clip": 0.01075283, "auxiliary_loss_mlp": 0.01033249, "balance_loss_clip": 1.01794267, "balance_loss_mlp": 1.02471089, "epoch": 0.358815571922441, "flos": 21470047649280.0, "grad_norm": 1.7376755701244664, "language_loss": 0.8022114, "learning_rate": 2.858662759239909e-06, "loss": 0.82329679, "num_input_tokens_seen": 128232980, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 5968, "time_per_iteration": 2.3881704807281494 }, { "auxiliary_loss_clip": 0.01076556, "auxiliary_loss_mlp": 0.01039407, "balance_loss_clip": 1.02344513, "balance_loss_mlp": 1.02425456, "epoch": 0.358875695175109, "flos": 21834331441920.0, "grad_norm": 2.1063316108064427, "language_loss": 0.84497571, "learning_rate": 2.858321514805395e-06, "loss": 0.86613536, "num_input_tokens_seen": 128252795, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5234375, "step": 5969, "time_per_iteration": 2.417069673538208 }, { "auxiliary_loss_clip": 0.01070347, "auxiliary_loss_mlp": 0.01024977, "balance_loss_clip": 1.01148224, "balance_loss_mlp": 1.02203286, "epoch": 0.35893581842777694, "flos": 32010361488000.0, "grad_norm": 1.758826799255447, "language_loss": 0.72189152, "learning_rate": 2.8579802397395953e-06, "loss": 0.7428447, "num_input_tokens_seen": 128273115, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.484375, "step": 5970, "time_per_iteration": 2.47866153717041 }, { "auxiliary_loss_clip": 0.01071148, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.01675689, "balance_loss_mlp": 1.02226007, "epoch": 0.3589959416804449, "flos": 20484763770240.0, "grad_norm": 1.7765434929520485, "language_loss": 0.79491836, "learning_rate": 2.857638934054687e-06, "loss": 0.8159349, "num_input_tokens_seen": 128292220, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48828125, "step": 5971, "time_per_iteration": 2.4473137855529785 }, { "auxiliary_loss_clip": 0.01071605, "auxiliary_loss_mlp": 0.01030863, "balance_loss_clip": 1.01579535, "balance_loss_mlp": 1.02054417, "epoch": 0.3590560649331129, "flos": 16179728486400.0, "grad_norm": 1.7992220947713973, "language_loss": 0.78177643, "learning_rate": 2.8572975977628517e-06, "loss": 0.80280107, "num_input_tokens_seen": 128310305, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 5972, "time_per_iteration": 2.359095573425293 }, { "auxiliary_loss_clip": 0.0107055, "auxiliary_loss_mlp": 0.01032108, "balance_loss_clip": 1.0173142, "balance_loss_mlp": 1.02126408, "epoch": 0.35911618818578084, "flos": 20374368451200.0, "grad_norm": 1.9280626901090425, "language_loss": 0.81194162, "learning_rate": 2.8569562308762697e-06, "loss": 0.83296818, "num_input_tokens_seen": 128328305, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.4921875, "step": 5973, "time_per_iteration": 2.3903608322143555 }, { "auxiliary_loss_clip": 0.01013822, "auxiliary_loss_mlp": 0.00999089, "balance_loss_clip": 0.99780113, "balance_loss_mlp": 1.00375342, "epoch": 0.3591763114384488, "flos": 41234308358400.0, "grad_norm": 0.9096813525259001, "language_loss": 0.5677613, "learning_rate": 2.8566148334071245e-06, "loss": 0.58789039, "num_input_tokens_seen": 128378380, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.10058594, "step": 5974, "time_per_iteration": 2.891608238220215 }, { "auxiliary_loss_clip": 0.01071886, "auxiliary_loss_mlp": 0.01028706, "balance_loss_clip": 1.01536608, "balance_loss_mlp": 1.02229309, "epoch": 0.35923643469111677, "flos": 18694522517760.0, "grad_norm": 1.9743637413899624, "language_loss": 0.69251728, "learning_rate": 2.8562734053675997e-06, "loss": 0.71352315, "num_input_tokens_seen": 128394315, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.49609375, "step": 5975, "time_per_iteration": 2.4206223487854004 }, { "auxiliary_loss_clip": 0.01070207, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.01723135, "balance_loss_mlp": 1.02263165, "epoch": 0.35929655794378473, "flos": 25008774301440.0, "grad_norm": 1.7627566589357815, "language_loss": 0.79994309, "learning_rate": 2.8559319467698794e-06, "loss": 0.82095206, "num_input_tokens_seen": 128414515, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.47460938, "step": 5976, "time_per_iteration": 2.4443769454956055 }, { "auxiliary_loss_clip": 0.0107081, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.01508832, "balance_loss_mlp": 1.02138186, "epoch": 0.35935668119645275, "flos": 14974701310080.0, "grad_norm": 1.80127599447291, "language_loss": 0.7893914, "learning_rate": 2.855590457626149e-06, "loss": 0.81040287, "num_input_tokens_seen": 128430615, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49414062, "step": 5977, "time_per_iteration": 2.3912503719329834 }, { "auxiliary_loss_clip": 0.01069086, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.02023149, "balance_loss_mlp": 1.02143776, "epoch": 0.3594168044491207, "flos": 21177091497600.0, "grad_norm": 2.660693871903989, "language_loss": 0.80078697, "learning_rate": 2.855248937948597e-06, "loss": 0.82181215, "num_input_tokens_seen": 128449480, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.4765625, "step": 5978, "time_per_iteration": 2.388336658477783 }, { "auxiliary_loss_clip": 0.0106953, "auxiliary_loss_mlp": 0.01024494, "balance_loss_clip": 1.01004541, "balance_loss_mlp": 1.02077723, "epoch": 0.3594769277017887, "flos": 27670936648320.0, "grad_norm": 1.809516799260894, "language_loss": 0.6769433, "learning_rate": 2.8549073877494096e-06, "loss": 0.69788361, "num_input_tokens_seen": 128471465, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48828125, "step": 5979, "time_per_iteration": 2.4456100463867188 }, { "auxiliary_loss_clip": 0.0107007, "auxiliary_loss_mlp": 0.01025456, "balance_loss_clip": 1.01171136, "balance_loss_mlp": 1.02133811, "epoch": 0.35953705095445665, "flos": 23001233546880.0, "grad_norm": 2.679761879983404, "language_loss": 0.67270786, "learning_rate": 2.8545658070407773e-06, "loss": 0.69366312, "num_input_tokens_seen": 128490645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48828125, "step": 5980, "time_per_iteration": 2.3917486667633057 }, { "auxiliary_loss_clip": 0.01071541, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.01663971, "balance_loss_mlp": 1.02089095, "epoch": 0.3595971742071246, "flos": 25512990289920.0, "grad_norm": 1.9069745761872319, "language_loss": 0.71115279, "learning_rate": 2.8542241958348894e-06, "loss": 0.73219407, "num_input_tokens_seen": 128510225, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.50390625, "step": 5981, "time_per_iteration": 3.8275861740112305 }, { "auxiliary_loss_clip": 0.01074017, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 1.01759088, "balance_loss_mlp": 1.02403331, "epoch": 0.3596572974597926, "flos": 29861247703680.0, "grad_norm": 2.186124021653524, "language_loss": 0.71259987, "learning_rate": 2.8538825541439367e-06, "loss": 0.73367763, "num_input_tokens_seen": 128530195, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.5, "step": 5982, "time_per_iteration": 2.4482645988464355 }, { "auxiliary_loss_clip": 0.01068776, "auxiliary_loss_mlp": 0.01032037, "balance_loss_clip": 1.01901364, "balance_loss_mlp": 1.0227052, "epoch": 0.35971742071246054, "flos": 23111419397760.0, "grad_norm": 1.6310207903392884, "language_loss": 0.75598907, "learning_rate": 2.8535408819801127e-06, "loss": 0.77699721, "num_input_tokens_seen": 128549990, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.4609375, "step": 5983, "time_per_iteration": 2.424999713897705 }, { "auxiliary_loss_clip": 0.01076745, "auxiliary_loss_mlp": 0.0103267, "balance_loss_clip": 1.01652908, "balance_loss_mlp": 1.02407491, "epoch": 0.3597775439651285, "flos": 16724478430080.0, "grad_norm": 1.6772260668171775, "language_loss": 0.76604366, "learning_rate": 2.85319917935561e-06, "loss": 0.78713775, "num_input_tokens_seen": 128567925, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.52734375, "step": 5984, "time_per_iteration": 2.3655178546905518 }, { "auxiliary_loss_clip": 0.01069737, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.01650012, "balance_loss_mlp": 1.02234197, "epoch": 0.3598376672177965, "flos": 19718455138560.0, "grad_norm": 2.5706400797306372, "language_loss": 0.86202085, "learning_rate": 2.8528574462826234e-06, "loss": 0.88301313, "num_input_tokens_seen": 128585655, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.47460938, "step": 5985, "time_per_iteration": 5.212001085281372 }, { "auxiliary_loss_clip": 0.01068186, "auxiliary_loss_mlp": 0.01034093, "balance_loss_clip": 1.0196445, "balance_loss_mlp": 1.02062666, "epoch": 0.35989779047046444, "flos": 17310565290240.0, "grad_norm": 1.31571106849939, "language_loss": 0.72440183, "learning_rate": 2.852515682773348e-06, "loss": 0.74542469, "num_input_tokens_seen": 128604820, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.4765625, "step": 5986, "time_per_iteration": 2.3966708183288574 }, { "auxiliary_loss_clip": 0.0107331, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.01806188, "balance_loss_mlp": 1.02130544, "epoch": 0.3599579137231324, "flos": 22710127697280.0, "grad_norm": 2.956739643243447, "language_loss": 0.74059355, "learning_rate": 2.8521738888399815e-06, "loss": 0.76166123, "num_input_tokens_seen": 128623070, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.51953125, "step": 5987, "time_per_iteration": 2.427263021469116 }, { "auxiliary_loss_clip": 0.0107469, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.01565039, "balance_loss_mlp": 1.02407086, "epoch": 0.36001803697580037, "flos": 20958814477440.0, "grad_norm": 2.5739309117662703, "language_loss": 0.69163907, "learning_rate": 2.8518320644947204e-06, "loss": 0.71269071, "num_input_tokens_seen": 128642430, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.50390625, "step": 5988, "time_per_iteration": 2.4609618186950684 }, { "auxiliary_loss_clip": 0.01073051, "auxiliary_loss_mlp": 0.01028814, "balance_loss_clip": 1.01384687, "balance_loss_mlp": 1.02237487, "epoch": 0.36007816022846834, "flos": 20484519390720.0, "grad_norm": 1.8033435565128877, "language_loss": 0.73564243, "learning_rate": 2.851490209749764e-06, "loss": 0.75666106, "num_input_tokens_seen": 128661285, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5078125, "step": 5989, "time_per_iteration": 3.8666367530822754 }, { "auxiliary_loss_clip": 0.01069402, "auxiliary_loss_mlp": 0.01026398, "balance_loss_clip": 1.01235521, "balance_loss_mlp": 1.02149057, "epoch": 0.36013828348113636, "flos": 27999993012480.0, "grad_norm": 2.594888852621829, "language_loss": 0.80210066, "learning_rate": 2.8511483246173126e-06, "loss": 0.82305861, "num_input_tokens_seen": 128682210, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.48046875, "step": 5990, "time_per_iteration": 2.440448045730591 }, { "auxiliary_loss_clip": 0.01073075, "auxiliary_loss_mlp": 0.01026314, "balance_loss_clip": 1.01153827, "balance_loss_mlp": 1.02331042, "epoch": 0.3601984067338043, "flos": 20081202831360.0, "grad_norm": 1.6472624849151765, "language_loss": 0.840931, "learning_rate": 2.8508064091095664e-06, "loss": 0.86192489, "num_input_tokens_seen": 128700445, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.49609375, "step": 5991, "time_per_iteration": 2.396472930908203 }, { "auxiliary_loss_clip": 0.01072402, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.01835823, "balance_loss_mlp": 1.02134836, "epoch": 0.3602585299864723, "flos": 18616806097920.0, "grad_norm": 1.7126643977310503, "language_loss": 0.75447881, "learning_rate": 2.8504644632387286e-06, "loss": 0.77553165, "num_input_tokens_seen": 128716855, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.51171875, "step": 5992, "time_per_iteration": 2.3715078830718994 }, { "auxiliary_loss_clip": 0.01069857, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 1.02072811, "balance_loss_mlp": 1.0215801, "epoch": 0.36031865323914025, "flos": 19571994518400.0, "grad_norm": 1.8280424757494191, "language_loss": 0.77356052, "learning_rate": 2.850122487017002e-06, "loss": 0.79461169, "num_input_tokens_seen": 128735835, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.48242188, "step": 5993, "time_per_iteration": 2.375019073486328 }, { "auxiliary_loss_clip": 0.01074143, "auxiliary_loss_mlp": 0.01036265, "balance_loss_clip": 1.02153707, "balance_loss_mlp": 1.02269566, "epoch": 0.3603787764918082, "flos": 17489739720960.0, "grad_norm": 1.6386141039743205, "language_loss": 0.74430043, "learning_rate": 2.84978048045659e-06, "loss": 0.76540458, "num_input_tokens_seen": 128752465, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.515625, "step": 5994, "time_per_iteration": 2.3964900970458984 }, { "auxiliary_loss_clip": 0.0107325, "auxiliary_loss_mlp": 0.01029603, "balance_loss_clip": 1.0149461, "balance_loss_mlp": 1.02215815, "epoch": 0.3604388997444762, "flos": 15522488542080.0, "grad_norm": 1.689677266658668, "language_loss": 0.68651265, "learning_rate": 2.8494384435696987e-06, "loss": 0.70754117, "num_input_tokens_seen": 128770865, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5078125, "step": 5995, "time_per_iteration": 2.356747627258301 }, { "auxiliary_loss_clip": 0.01072455, "auxiliary_loss_mlp": 0.01032943, "balance_loss_clip": 1.01810753, "balance_loss_mlp": 1.02186203, "epoch": 0.36049902299714415, "flos": 17309936885760.0, "grad_norm": 1.7972816808866687, "language_loss": 0.82576621, "learning_rate": 2.849096376368534e-06, "loss": 0.84682024, "num_input_tokens_seen": 128789730, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 5996, "time_per_iteration": 2.375746965408325 }, { "auxiliary_loss_clip": 0.01071046, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.01374257, "balance_loss_mlp": 1.02251923, "epoch": 0.3605591462498121, "flos": 17055070894080.0, "grad_norm": 1.6360357918131012, "language_loss": 0.73591554, "learning_rate": 2.8487542788653044e-06, "loss": 0.75690454, "num_input_tokens_seen": 128806610, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.484375, "step": 5997, "time_per_iteration": 2.3556747436523438 }, { "auxiliary_loss_clip": 0.0106946, "auxiliary_loss_mlp": 0.01029605, "balance_loss_clip": 1.01534796, "balance_loss_mlp": 1.02197778, "epoch": 0.3606192695024801, "flos": 16835921089920.0, "grad_norm": 2.443251493045155, "language_loss": 0.68559325, "learning_rate": 2.848412151072218e-06, "loss": 0.70658386, "num_input_tokens_seen": 128824830, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.47460938, "step": 5998, "time_per_iteration": 2.381971597671509 }, { "auxiliary_loss_clip": 0.01072282, "auxiliary_loss_mlp": 0.01021889, "balance_loss_clip": 1.00744081, "balance_loss_mlp": 1.02302206, "epoch": 0.36067939275514804, "flos": 12128860967040.0, "grad_norm": 2.1652778127333745, "language_loss": 0.77397305, "learning_rate": 2.8480699930014834e-06, "loss": 0.79491478, "num_input_tokens_seen": 128838170, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.4921875, "step": 5999, "time_per_iteration": 2.3386337757110596 }, { "auxiliary_loss_clip": 0.01072589, "auxiliary_loss_mlp": 0.01035818, "balance_loss_clip": 1.02178073, "balance_loss_mlp": 1.02274752, "epoch": 0.360739516007816, "flos": 18040459507200.0, "grad_norm": 5.906092126339494, "language_loss": 0.78284979, "learning_rate": 2.847727804665313e-06, "loss": 0.80393386, "num_input_tokens_seen": 128855625, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49804688, "step": 6000, "time_per_iteration": 2.3734095096588135 }, { "auxiliary_loss_clip": 0.01070591, "auxiliary_loss_mlp": 0.0102952, "balance_loss_clip": 1.0152092, "balance_loss_mlp": 1.02154922, "epoch": 0.360799639260484, "flos": 18548864858880.0, "grad_norm": 3.522855557431247, "language_loss": 0.78478992, "learning_rate": 2.8473855860759175e-06, "loss": 0.80579108, "num_input_tokens_seen": 128873540, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49023438, "step": 6001, "time_per_iteration": 2.3571958541870117 }, { "auxiliary_loss_clip": 0.01068303, "auxiliary_loss_mlp": 0.01021467, "balance_loss_clip": 1.00745416, "balance_loss_mlp": 1.02169609, "epoch": 0.36085976251315194, "flos": 19681028294400.0, "grad_norm": 2.384961015490449, "language_loss": 0.83246374, "learning_rate": 2.847043337245511e-06, "loss": 0.85336137, "num_input_tokens_seen": 128889925, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.46679688, "step": 6002, "time_per_iteration": 2.3922977447509766 }, { "auxiliary_loss_clip": 0.01066386, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.01165187, "balance_loss_mlp": 1.02045155, "epoch": 0.3609198857658199, "flos": 24198021642240.0, "grad_norm": 1.9957726217601077, "language_loss": 0.90845191, "learning_rate": 2.8467010581863058e-06, "loss": 0.92936337, "num_input_tokens_seen": 128906890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.45898438, "step": 6003, "time_per_iteration": 2.384376287460327 }, { "auxiliary_loss_clip": 0.01014298, "auxiliary_loss_mlp": 0.0099876, "balance_loss_clip": 0.99746621, "balance_loss_mlp": 1.00391531, "epoch": 0.3609800090184879, "flos": 57112946346240.0, "grad_norm": 0.8654188719359439, "language_loss": 0.53336197, "learning_rate": 2.8463587489105175e-06, "loss": 0.55349255, "num_input_tokens_seen": 128965940, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.10351562, "step": 6004, "time_per_iteration": 2.9778435230255127 }, { "auxiliary_loss_clip": 0.01070475, "auxiliary_loss_mlp": 0.01027862, "balance_loss_clip": 1.01256752, "balance_loss_mlp": 1.02094913, "epoch": 0.3610401322711559, "flos": 20810259175680.0, "grad_norm": 1.8112342696294805, "language_loss": 0.77950227, "learning_rate": 2.846016409430363e-06, "loss": 0.80048561, "num_input_tokens_seen": 128985835, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.49414062, "step": 6005, "time_per_iteration": 2.40242862701416 }, { "auxiliary_loss_clip": 0.01071135, "auxiliary_loss_mlp": 0.01029362, "balance_loss_clip": 1.01550364, "balance_loss_mlp": 1.02229643, "epoch": 0.36110025552382385, "flos": 13698311581440.0, "grad_norm": 3.3247251940557345, "language_loss": 0.79514426, "learning_rate": 2.8456740397580586e-06, "loss": 0.81614923, "num_input_tokens_seen": 129003120, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.48828125, "step": 6006, "time_per_iteration": 2.397096633911133 }, { "auxiliary_loss_clip": 0.01073198, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.01353323, "balance_loss_mlp": 1.02276611, "epoch": 0.3611603787764918, "flos": 22453935073920.0, "grad_norm": 3.3961878709665156, "language_loss": 0.84467876, "learning_rate": 2.845331639905824e-06, "loss": 0.86570913, "num_input_tokens_seen": 129021645, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.50390625, "step": 6007, "time_per_iteration": 2.3960461616516113 }, { "auxiliary_loss_clip": 0.01074799, "auxiliary_loss_mlp": 0.01031876, "balance_loss_clip": 1.01552045, "balance_loss_mlp": 1.02260673, "epoch": 0.3612205020291598, "flos": 20885601623040.0, "grad_norm": 1.929858206572468, "language_loss": 0.73038328, "learning_rate": 2.844989209885877e-06, "loss": 0.75145, "num_input_tokens_seen": 129038375, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5234375, "step": 6008, "time_per_iteration": 2.411940574645996 }, { "auxiliary_loss_clip": 0.01068807, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.01484811, "balance_loss_mlp": 1.02130961, "epoch": 0.36128062528182775, "flos": 15741079764480.0, "grad_norm": 1.8621695631660327, "language_loss": 0.827088, "learning_rate": 2.844646749710439e-06, "loss": 0.84806877, "num_input_tokens_seen": 129056235, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.47460938, "step": 6009, "time_per_iteration": 2.3955862522125244 }, { "auxiliary_loss_clip": 0.01072142, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.01379824, "balance_loss_mlp": 1.02278244, "epoch": 0.3613407485344957, "flos": 16763546108160.0, "grad_norm": 2.1010749206175316, "language_loss": 0.76096261, "learning_rate": 2.844304259391731e-06, "loss": 0.78197145, "num_input_tokens_seen": 129072405, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.4921875, "step": 6010, "time_per_iteration": 2.392655611038208 }, { "auxiliary_loss_clip": 0.01071585, "auxiliary_loss_mlp": 0.01031625, "balance_loss_clip": 1.01671231, "balance_loss_mlp": 1.02281487, "epoch": 0.3614008717871637, "flos": 20370283822080.0, "grad_norm": 1.7025574288875394, "language_loss": 0.82709467, "learning_rate": 2.8439617389419757e-06, "loss": 0.84812677, "num_input_tokens_seen": 129090225, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48828125, "step": 6011, "time_per_iteration": 2.410205125808716 }, { "auxiliary_loss_clip": 0.01075987, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.01916671, "balance_loss_mlp": 1.02419984, "epoch": 0.36146099503983165, "flos": 22775764786560.0, "grad_norm": 2.1693312184923372, "language_loss": 0.62887549, "learning_rate": 2.843619188373397e-06, "loss": 0.64998496, "num_input_tokens_seen": 129107685, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.51953125, "step": 6012, "time_per_iteration": 2.446655511856079 }, { "auxiliary_loss_clip": 0.01066514, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.01835549, "balance_loss_mlp": 1.0201298, "epoch": 0.3615211182924996, "flos": 22995717552000.0, "grad_norm": 1.799851830212082, "language_loss": 0.83540189, "learning_rate": 2.843276607698219e-06, "loss": 0.85638869, "num_input_tokens_seen": 129125315, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.46289062, "step": 6013, "time_per_iteration": 2.403985023498535 }, { "auxiliary_loss_clip": 0.01070088, "auxiliary_loss_mlp": 0.01030337, "balance_loss_clip": 1.01508391, "balance_loss_mlp": 1.02161551, "epoch": 0.3615812415451676, "flos": 16647320592000.0, "grad_norm": 1.830283251842251, "language_loss": 0.91465521, "learning_rate": 2.8429339969286687e-06, "loss": 0.93565953, "num_input_tokens_seen": 129141600, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.484375, "step": 6014, "time_per_iteration": 2.4018900394439697 }, { "auxiliary_loss_clip": 0.01070047, "auxiliary_loss_mlp": 0.01030247, "balance_loss_clip": 1.01532149, "balance_loss_mlp": 1.02177155, "epoch": 0.36164136479783554, "flos": 21319153286400.0, "grad_norm": 1.6945475273154569, "language_loss": 0.73786128, "learning_rate": 2.8425913560769725e-06, "loss": 0.75886428, "num_input_tokens_seen": 129160665, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48242188, "step": 6015, "time_per_iteration": 2.405226469039917 }, { "auxiliary_loss_clip": 0.01072748, "auxiliary_loss_mlp": 0.01030815, "balance_loss_clip": 1.0158546, "balance_loss_mlp": 1.02209187, "epoch": 0.3617014880505035, "flos": 24168449854080.0, "grad_norm": 2.2412958755721237, "language_loss": 0.6504758, "learning_rate": 2.8422486851553577e-06, "loss": 0.67151141, "num_input_tokens_seen": 129179220, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5078125, "step": 6016, "time_per_iteration": 2.425557851791382 }, { "auxiliary_loss_clip": 0.01073209, "auxiliary_loss_mlp": 0.01033838, "balance_loss_clip": 1.01727951, "balance_loss_mlp": 1.02278531, "epoch": 0.3617616113031715, "flos": 39013414951680.0, "grad_norm": 1.7495379388897212, "language_loss": 0.71684813, "learning_rate": 2.8419059841760545e-06, "loss": 0.73791862, "num_input_tokens_seen": 129200385, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.50390625, "step": 6017, "time_per_iteration": 2.5464694499969482 }, { "auxiliary_loss_clip": 0.01073047, "auxiliary_loss_mlp": 0.01029447, "balance_loss_clip": 1.01409292, "balance_loss_mlp": 1.02167881, "epoch": 0.3618217345558395, "flos": 12130013041920.0, "grad_norm": 1.8318374531220054, "language_loss": 0.73157543, "learning_rate": 2.8415632531512916e-06, "loss": 0.75260037, "num_input_tokens_seen": 129217395, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.515625, "step": 6018, "time_per_iteration": 2.374448299407959 }, { "auxiliary_loss_clip": 0.0106963, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.01555562, "balance_loss_mlp": 1.02175033, "epoch": 0.36188185780850746, "flos": 24933885701760.0, "grad_norm": 2.0418854675693727, "language_loss": 0.69575953, "learning_rate": 2.841220492093301e-06, "loss": 0.71675861, "num_input_tokens_seen": 129238940, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.47851562, "step": 6019, "time_per_iteration": 2.425837278366089 }, { "auxiliary_loss_clip": 0.01074115, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.01635003, "balance_loss_mlp": 1.02307045, "epoch": 0.3619419810611754, "flos": 20957802048000.0, "grad_norm": 1.9137437844254053, "language_loss": 0.76357806, "learning_rate": 2.840877701014316e-06, "loss": 0.78463715, "num_input_tokens_seen": 129258240, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5078125, "step": 6020, "time_per_iteration": 2.4055187702178955 }, { "auxiliary_loss_clip": 0.0107369, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.01760221, "balance_loss_mlp": 1.02433956, "epoch": 0.3620021043138434, "flos": 22527776332800.0, "grad_norm": 1.662826101099494, "language_loss": 0.73821962, "learning_rate": 2.840534879926567e-06, "loss": 0.75928688, "num_input_tokens_seen": 129279040, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.4921875, "step": 6021, "time_per_iteration": 3.8287951946258545 }, { "auxiliary_loss_clip": 0.01071361, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.01703238, "balance_loss_mlp": 1.02220607, "epoch": 0.36206222756651135, "flos": 15595771219200.0, "grad_norm": 1.668204654199499, "language_loss": 0.80771255, "learning_rate": 2.8401920288422915e-06, "loss": 0.82873923, "num_input_tokens_seen": 129295415, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.4921875, "step": 6022, "time_per_iteration": 2.3824174404144287 }, { "auxiliary_loss_clip": 0.01069059, "auxiliary_loss_mlp": 0.01027948, "balance_loss_clip": 1.01399469, "balance_loss_mlp": 1.02202117, "epoch": 0.3621223508191793, "flos": 23586028686720.0, "grad_norm": 1.8072920444109888, "language_loss": 0.81503475, "learning_rate": 2.8398491477737235e-06, "loss": 0.83600485, "num_input_tokens_seen": 129312620, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.46875, "step": 6023, "time_per_iteration": 2.398380994796753 }, { "auxiliary_loss_clip": 0.01071812, "auxiliary_loss_mlp": 0.0102838, "balance_loss_clip": 1.01320481, "balance_loss_mlp": 1.02203727, "epoch": 0.3621824740718473, "flos": 22308801085440.0, "grad_norm": 1.5989021913224075, "language_loss": 0.79522765, "learning_rate": 2.8395062367330997e-06, "loss": 0.81622958, "num_input_tokens_seen": 129331825, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.49804688, "step": 6024, "time_per_iteration": 3.8028693199157715 }, { "auxiliary_loss_clip": 0.01068689, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.01492524, "balance_loss_mlp": 1.02201271, "epoch": 0.36224259732451525, "flos": 16762708235520.0, "grad_norm": 2.760747341964129, "language_loss": 0.75033462, "learning_rate": 2.839163295732658e-06, "loss": 0.77130175, "num_input_tokens_seen": 129350400, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.46679688, "step": 6025, "time_per_iteration": 3.770988702774048 }, { "auxiliary_loss_clip": 0.01069876, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.01622677, "balance_loss_mlp": 1.02261126, "epoch": 0.3623027205771832, "flos": 23148601862400.0, "grad_norm": 2.1020669072743066, "language_loss": 0.72191185, "learning_rate": 2.8388203247846365e-06, "loss": 0.74290782, "num_input_tokens_seen": 129371155, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.47265625, "step": 6026, "time_per_iteration": 2.397810220718384 }, { "auxiliary_loss_clip": 0.01077714, "auxiliary_loss_mlp": 0.01034315, "balance_loss_clip": 1.01856709, "balance_loss_mlp": 1.02483678, "epoch": 0.3623628438298512, "flos": 28547884978560.0, "grad_norm": 2.0490089708430705, "language_loss": 0.78985703, "learning_rate": 2.8384773239012757e-06, "loss": 0.81097728, "num_input_tokens_seen": 129391230, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.52734375, "step": 6027, "time_per_iteration": 2.450732707977295 }, { "auxiliary_loss_clip": 0.01073279, "auxiliary_loss_mlp": 0.01035291, "balance_loss_clip": 1.01925683, "balance_loss_mlp": 1.02260876, "epoch": 0.36242296708251914, "flos": 25483732704000.0, "grad_norm": 2.390060660765317, "language_loss": 0.67954075, "learning_rate": 2.838134293094815e-06, "loss": 0.70062649, "num_input_tokens_seen": 129410065, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5078125, "step": 6028, "time_per_iteration": 2.428199291229248 }, { "auxiliary_loss_clip": 0.01071619, "auxiliary_loss_mlp": 0.01027801, "balance_loss_clip": 1.01357889, "balance_loss_mlp": 1.02303064, "epoch": 0.3624830903351871, "flos": 16289425578240.0, "grad_norm": 1.6164910617338464, "language_loss": 0.85275388, "learning_rate": 2.8377912323774986e-06, "loss": 0.87374812, "num_input_tokens_seen": 129428655, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48632812, "step": 6029, "time_per_iteration": 3.7839155197143555 }, { "auxiliary_loss_clip": 0.01070903, "auxiliary_loss_mlp": 0.01027415, "balance_loss_clip": 1.01374197, "balance_loss_mlp": 1.02290821, "epoch": 0.36254321358785513, "flos": 18295325498880.0, "grad_norm": 1.6603485673202667, "language_loss": 0.72660106, "learning_rate": 2.8374481417615675e-06, "loss": 0.74758422, "num_input_tokens_seen": 129447845, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.48046875, "step": 6030, "time_per_iteration": 2.3626651763916016 }, { "auxiliary_loss_clip": 0.01074222, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.01628411, "balance_loss_mlp": 1.02194142, "epoch": 0.3626033368405231, "flos": 14864445636480.0, "grad_norm": 2.3157037466546644, "language_loss": 0.74142039, "learning_rate": 2.8371050212592664e-06, "loss": 0.76249719, "num_input_tokens_seen": 129463275, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5234375, "step": 6031, "time_per_iteration": 2.3725292682647705 }, { "auxiliary_loss_clip": 0.010703, "auxiliary_loss_mlp": 0.01023962, "balance_loss_clip": 1.00937104, "balance_loss_mlp": 1.02210462, "epoch": 0.36266346009319106, "flos": 22305589240320.0, "grad_norm": 1.6102340557882369, "language_loss": 0.7318635, "learning_rate": 2.8367618708828413e-06, "loss": 0.75280613, "num_input_tokens_seen": 129483205, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.48242188, "step": 6032, "time_per_iteration": 2.393301248550415 }, { "auxiliary_loss_clip": 0.01072592, "auxiliary_loss_mlp": 0.01030492, "balance_loss_clip": 1.01608527, "balance_loss_mlp": 1.02226877, "epoch": 0.362723583345859, "flos": 18221379505920.0, "grad_norm": 2.0396954233271827, "language_loss": 0.78155452, "learning_rate": 2.836418690644536e-06, "loss": 0.80258536, "num_input_tokens_seen": 129499885, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.50390625, "step": 6033, "time_per_iteration": 2.3853368759155273 }, { "auxiliary_loss_clip": 0.01012328, "auxiliary_loss_mlp": 0.01019246, "balance_loss_clip": 1.01789856, "balance_loss_mlp": 1.00241518, "epoch": 0.362783706598527, "flos": 68495818959360.0, "grad_norm": 0.801836756334226, "language_loss": 0.64749706, "learning_rate": 2.8360754805566004e-06, "loss": 0.66781282, "num_input_tokens_seen": 129561885, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.09912109, "step": 6034, "time_per_iteration": 3.11136794090271 }, { "auxiliary_loss_clip": 0.01071413, "auxiliary_loss_mlp": 0.0102934, "balance_loss_clip": 1.01399183, "balance_loss_mlp": 1.02213168, "epoch": 0.36284382985119495, "flos": 26575432007040.0, "grad_norm": 1.6497360784137405, "language_loss": 0.89779735, "learning_rate": 2.835732240631281e-06, "loss": 0.91880488, "num_input_tokens_seen": 129582325, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.4921875, "step": 6035, "time_per_iteration": 2.427647113800049 }, { "auxiliary_loss_clip": 0.01073043, "auxiliary_loss_mlp": 0.01030498, "balance_loss_clip": 1.01587117, "balance_loss_mlp": 1.02252495, "epoch": 0.3629039531038629, "flos": 20155742317440.0, "grad_norm": 1.7384524033812592, "language_loss": 0.73809171, "learning_rate": 2.8353889708808274e-06, "loss": 0.75912702, "num_input_tokens_seen": 129600350, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.50390625, "step": 6036, "time_per_iteration": 2.3848750591278076 }, { "auxiliary_loss_clip": 0.01072986, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.01377857, "balance_loss_mlp": 1.02243233, "epoch": 0.3629640763565309, "flos": 18624696065280.0, "grad_norm": 1.8271408483473113, "language_loss": 0.75926924, "learning_rate": 2.835045671317491e-06, "loss": 0.78029221, "num_input_tokens_seen": 129618425, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.50390625, "step": 6037, "time_per_iteration": 2.4064931869506836 }, { "auxiliary_loss_clip": 0.01070234, "auxiliary_loss_mlp": 0.01041391, "balance_loss_clip": 1.02629948, "balance_loss_mlp": 1.02303052, "epoch": 0.36302419960919885, "flos": 19570493329920.0, "grad_norm": 1.5238921625190627, "language_loss": 0.78709567, "learning_rate": 2.834702341953522e-06, "loss": 0.80821192, "num_input_tokens_seen": 129636750, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.47265625, "step": 6038, "time_per_iteration": 2.392469644546509 }, { "auxiliary_loss_clip": 0.01012324, "auxiliary_loss_mlp": 0.01001427, "balance_loss_clip": 1.00004971, "balance_loss_mlp": 1.00219309, "epoch": 0.3630843228618668, "flos": 63794239920000.0, "grad_norm": 0.8251691126315029, "language_loss": 0.6337781, "learning_rate": 2.8343589828011737e-06, "loss": 0.65391564, "num_input_tokens_seen": 129699030, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.1015625, "step": 6039, "time_per_iteration": 3.1344316005706787 }, { "auxiliary_loss_clip": 0.01071489, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.02149868, "balance_loss_mlp": 1.02284336, "epoch": 0.3631444461145348, "flos": 21834087062400.0, "grad_norm": 2.4876093444982805, "language_loss": 0.71191859, "learning_rate": 2.8340155938726993e-06, "loss": 0.73299551, "num_input_tokens_seen": 129717135, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48632812, "step": 6040, "time_per_iteration": 2.410862922668457 }, { "auxiliary_loss_clip": 0.01076693, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.01563394, "balance_loss_mlp": 1.02409446, "epoch": 0.36320456936720275, "flos": 21721073391360.0, "grad_norm": 1.9267410556946198, "language_loss": 0.81326181, "learning_rate": 2.833672175180354e-06, "loss": 0.83433628, "num_input_tokens_seen": 129735940, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.52734375, "step": 6041, "time_per_iteration": 2.4156813621520996 }, { "auxiliary_loss_clip": 0.01074447, "auxiliary_loss_mlp": 0.01028398, "balance_loss_clip": 1.01278138, "balance_loss_mlp": 1.02320158, "epoch": 0.3632646926198707, "flos": 17018132808960.0, "grad_norm": 1.8663442910249932, "language_loss": 0.83524156, "learning_rate": 2.8333287267363934e-06, "loss": 0.85626996, "num_input_tokens_seen": 129752790, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51171875, "step": 6042, "time_per_iteration": 2.3927791118621826 }, { "auxiliary_loss_clip": 0.0107211, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.01739573, "balance_loss_mlp": 1.0238142, "epoch": 0.36332481587253873, "flos": 23330045531520.0, "grad_norm": 1.5747586370196147, "language_loss": 0.78099209, "learning_rate": 2.832985248553074e-06, "loss": 0.80203754, "num_input_tokens_seen": 129773655, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.48242188, "step": 6043, "time_per_iteration": 2.4068548679351807 }, { "auxiliary_loss_clip": 0.01070407, "auxiliary_loss_mlp": 0.01034211, "balance_loss_clip": 1.01822472, "balance_loss_mlp": 1.02241945, "epoch": 0.3633849391252067, "flos": 10742774146560.0, "grad_norm": 3.0625358771682882, "language_loss": 0.65889776, "learning_rate": 2.8326417406426536e-06, "loss": 0.67994392, "num_input_tokens_seen": 129791605, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.48046875, "step": 6044, "time_per_iteration": 2.3587169647216797 }, { "auxiliary_loss_clip": 0.01071919, "auxiliary_loss_mlp": 0.01029001, "balance_loss_clip": 1.01363468, "balance_loss_mlp": 1.02365458, "epoch": 0.36344506237787466, "flos": 25847946673920.0, "grad_norm": 1.6559850047449243, "language_loss": 0.8122344, "learning_rate": 2.8322982030173908e-06, "loss": 0.83324373, "num_input_tokens_seen": 129811075, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.484375, "step": 6045, "time_per_iteration": 2.45621395111084 }, { "auxiliary_loss_clip": 0.0107271, "auxiliary_loss_mlp": 0.01032149, "balance_loss_clip": 1.01638985, "balance_loss_mlp": 1.02294445, "epoch": 0.3635051856305426, "flos": 30152737578240.0, "grad_norm": 1.7701918222092998, "language_loss": 0.65188402, "learning_rate": 2.8319546356895467e-06, "loss": 0.67293257, "num_input_tokens_seen": 129833755, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.49609375, "step": 6046, "time_per_iteration": 2.4616801738739014 }, { "auxiliary_loss_clip": 0.01072185, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.01732159, "balance_loss_mlp": 1.02223921, "epoch": 0.3635653088832106, "flos": 22197358425600.0, "grad_norm": 1.7518953487820985, "language_loss": 0.77624506, "learning_rate": 2.831611038671382e-06, "loss": 0.79729009, "num_input_tokens_seen": 129854475, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5, "step": 6047, "time_per_iteration": 2.422403573989868 }, { "auxiliary_loss_clip": 0.01076261, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.02086687, "balance_loss_mlp": 1.0221169, "epoch": 0.36362543213587856, "flos": 24785993715840.0, "grad_norm": 1.5879397421556394, "language_loss": 0.79469538, "learning_rate": 2.8312674119751585e-06, "loss": 0.81584144, "num_input_tokens_seen": 129873530, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.5390625, "step": 6048, "time_per_iteration": 2.4356281757354736 }, { "auxiliary_loss_clip": 0.01011234, "auxiliary_loss_mlp": 0.01001155, "balance_loss_clip": 0.99994522, "balance_loss_mlp": 1.00127006, "epoch": 0.3636855553885465, "flos": 62522877427200.0, "grad_norm": 0.7530051565630759, "language_loss": 0.52588218, "learning_rate": 2.8309237556131385e-06, "loss": 0.54600608, "num_input_tokens_seen": 129940400, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.09960938, "step": 6049, "time_per_iteration": 3.103388786315918 }, { "auxiliary_loss_clip": 0.01073577, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 1.01375103, "balance_loss_mlp": 1.02340925, "epoch": 0.3637456786412145, "flos": 24059520812160.0, "grad_norm": 2.0125450543831747, "language_loss": 0.86114162, "learning_rate": 2.8305800695975873e-06, "loss": 0.88217199, "num_input_tokens_seen": 129958635, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.50390625, "step": 6050, "time_per_iteration": 2.4318606853485107 }, { "auxiliary_loss_clip": 0.01072245, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 1.01792586, "balance_loss_mlp": 1.02397776, "epoch": 0.36380580189388245, "flos": 16690542721920.0, "grad_norm": 1.8121395456824636, "language_loss": 0.77918768, "learning_rate": 2.8302363539407703e-06, "loss": 0.80023092, "num_input_tokens_seen": 129977685, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.484375, "step": 6051, "time_per_iteration": 2.387392520904541 }, { "auxiliary_loss_clip": 0.01072581, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.0183177, "balance_loss_mlp": 1.02278066, "epoch": 0.3638659251465504, "flos": 25113060132480.0, "grad_norm": 1.7123928093346799, "language_loss": 0.82470536, "learning_rate": 2.829892608654953e-06, "loss": 0.84575599, "num_input_tokens_seen": 129997530, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.49804688, "step": 6052, "time_per_iteration": 2.4124338626861572 }, { "auxiliary_loss_clip": 0.01068165, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.01685691, "balance_loss_mlp": 1.02117634, "epoch": 0.3639260483992184, "flos": 23001896862720.0, "grad_norm": 1.4943925841120913, "language_loss": 0.7220093, "learning_rate": 2.829548833752404e-06, "loss": 0.74298918, "num_input_tokens_seen": 130017955, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.47070312, "step": 6053, "time_per_iteration": 2.447387933731079 }, { "auxiliary_loss_clip": 0.01011196, "auxiliary_loss_mlp": 0.010035, "balance_loss_clip": 1.00231433, "balance_loss_mlp": 1.00151181, "epoch": 0.36398617165188635, "flos": 70712839071360.0, "grad_norm": 0.7724249829209577, "language_loss": 0.61200237, "learning_rate": 2.8292050292453904e-06, "loss": 0.63214934, "num_input_tokens_seen": 130074275, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.09667969, "step": 6054, "time_per_iteration": 3.0855648517608643 }, { "auxiliary_loss_clip": 0.01071131, "auxiliary_loss_mlp": 0.01032404, "balance_loss_clip": 1.01699007, "balance_loss_mlp": 1.02134585, "epoch": 0.3640462949045543, "flos": 22234401244800.0, "grad_norm": 1.8421544709503386, "language_loss": 0.75803816, "learning_rate": 2.828861195146182e-06, "loss": 0.77907354, "num_input_tokens_seen": 130091375, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.49804688, "step": 6055, "time_per_iteration": 2.430845022201538 }, { "auxiliary_loss_clip": 0.01072917, "auxiliary_loss_mlp": 0.01037189, "balance_loss_clip": 1.02200162, "balance_loss_mlp": 1.0230329, "epoch": 0.3641064181572223, "flos": 21542457542400.0, "grad_norm": 1.50952682916665, "language_loss": 0.75209242, "learning_rate": 2.82851733146705e-06, "loss": 0.77319348, "num_input_tokens_seen": 130111595, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5, "step": 6056, "time_per_iteration": 2.3948845863342285 }, { "auxiliary_loss_clip": 0.0107158, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.02169693, "balance_loss_mlp": 1.02250767, "epoch": 0.3641665414098903, "flos": 22272212113920.0, "grad_norm": 1.777259886806799, "language_loss": 0.80024457, "learning_rate": 2.8281734382202657e-06, "loss": 0.8213309, "num_input_tokens_seen": 130131440, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.4921875, "step": 6057, "time_per_iteration": 2.384439468383789 }, { "auxiliary_loss_clip": 0.01071641, "auxiliary_loss_mlp": 0.01029192, "balance_loss_clip": 1.01478541, "balance_loss_mlp": 1.02222943, "epoch": 0.36422666466255826, "flos": 28328420972160.0, "grad_norm": 2.0290703185367143, "language_loss": 0.80716157, "learning_rate": 2.8278295154181017e-06, "loss": 0.82816988, "num_input_tokens_seen": 130151375, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49414062, "step": 6058, "time_per_iteration": 2.474151372909546 }, { "auxiliary_loss_clip": 0.01071565, "auxiliary_loss_mlp": 0.01032193, "balance_loss_clip": 1.01655245, "balance_loss_mlp": 1.02245402, "epoch": 0.36428678791522623, "flos": 24169357549440.0, "grad_norm": 1.7015808799638785, "language_loss": 0.85123634, "learning_rate": 2.8274855630728316e-06, "loss": 0.87227386, "num_input_tokens_seen": 130169960, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.4921875, "step": 6059, "time_per_iteration": 2.4142816066741943 }, { "auxiliary_loss_clip": 0.01071468, "auxiliary_loss_mlp": 0.01031453, "balance_loss_clip": 1.01624775, "balance_loss_mlp": 1.02149642, "epoch": 0.3643469111678942, "flos": 22527357396480.0, "grad_norm": 1.4250841475354012, "language_loss": 0.88126129, "learning_rate": 2.82714158119673e-06, "loss": 0.90229052, "num_input_tokens_seen": 130189800, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5, "step": 6060, "time_per_iteration": 2.4167838096618652 }, { "auxiliary_loss_clip": 0.01072664, "auxiliary_loss_mlp": 0.0103502, "balance_loss_clip": 1.01926088, "balance_loss_mlp": 1.02345657, "epoch": 0.36440703442056216, "flos": 19425603720960.0, "grad_norm": 3.0504392611882754, "language_loss": 0.67363739, "learning_rate": 2.826797569802074e-06, "loss": 0.69471419, "num_input_tokens_seen": 130206370, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.4921875, "step": 6061, "time_per_iteration": 3.7727997303009033 }, { "auxiliary_loss_clip": 0.010745, "auxiliary_loss_mlp": 0.01029858, "balance_loss_clip": 1.01421785, "balance_loss_mlp": 1.02394438, "epoch": 0.3644671576732301, "flos": 18039551811840.0, "grad_norm": 2.047186421643374, "language_loss": 0.74945015, "learning_rate": 2.826453528901139e-06, "loss": 0.77049369, "num_input_tokens_seen": 130224445, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.50390625, "step": 6062, "time_per_iteration": 2.374467372894287 }, { "auxiliary_loss_clip": 0.01071705, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.01338673, "balance_loss_mlp": 1.02322197, "epoch": 0.3645272809258981, "flos": 21541759315200.0, "grad_norm": 1.7153839465780016, "language_loss": 0.72601569, "learning_rate": 2.826109458506203e-06, "loss": 0.74702626, "num_input_tokens_seen": 130245380, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.484375, "step": 6063, "time_per_iteration": 2.4319028854370117 }, { "auxiliary_loss_clip": 0.01072168, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.01460075, "balance_loss_mlp": 1.02368081, "epoch": 0.36458740417856605, "flos": 22745774062080.0, "grad_norm": 1.8262591781232413, "language_loss": 0.67901099, "learning_rate": 2.825765358629546e-06, "loss": 0.70002508, "num_input_tokens_seen": 130265575, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48632812, "step": 6064, "time_per_iteration": 3.8194692134857178 }, { "auxiliary_loss_clip": 0.01074266, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.01827288, "balance_loss_mlp": 1.02300286, "epoch": 0.364647527431234, "flos": 26139471459840.0, "grad_norm": 2.0185268382777224, "language_loss": 0.74218488, "learning_rate": 2.825421229283447e-06, "loss": 0.76326197, "num_input_tokens_seen": 130286195, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.515625, "step": 6065, "time_per_iteration": 3.8645195960998535 }, { "auxiliary_loss_clip": 0.01074578, "auxiliary_loss_mlp": 0.0103359, "balance_loss_clip": 1.01641214, "balance_loss_mlp": 1.02254605, "epoch": 0.364707650683902, "flos": 31028568744960.0, "grad_norm": 2.5926110301281207, "language_loss": 0.75493026, "learning_rate": 2.825077070480188e-06, "loss": 0.77601194, "num_input_tokens_seen": 130306095, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.51953125, "step": 6066, "time_per_iteration": 2.461808204650879 }, { "auxiliary_loss_clip": 0.01069396, "auxiliary_loss_mlp": 0.01025117, "balance_loss_clip": 1.01133609, "balance_loss_mlp": 1.02257264, "epoch": 0.36476777393656995, "flos": 19571889784320.0, "grad_norm": 2.0832752967370727, "language_loss": 0.76463497, "learning_rate": 2.8247328822320505e-06, "loss": 0.78558004, "num_input_tokens_seen": 130324685, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.46875, "step": 6067, "time_per_iteration": 2.4324355125427246 }, { "auxiliary_loss_clip": 0.01070256, "auxiliary_loss_mlp": 0.01030046, "balance_loss_clip": 1.01634336, "balance_loss_mlp": 1.022964, "epoch": 0.3648278971892379, "flos": 17747887380480.0, "grad_norm": 2.5657297006787023, "language_loss": 0.71200514, "learning_rate": 2.8243886645513176e-06, "loss": 0.73300815, "num_input_tokens_seen": 130343855, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.47265625, "step": 6068, "time_per_iteration": 2.388589859008789 }, { "auxiliary_loss_clip": 0.01072116, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.01584864, "balance_loss_mlp": 1.02157402, "epoch": 0.3648880204419059, "flos": 17930203833600.0, "grad_norm": 2.291649456482206, "language_loss": 0.73609048, "learning_rate": 2.8240444174502747e-06, "loss": 0.7571187, "num_input_tokens_seen": 130362320, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 6069, "time_per_iteration": 3.752021551132202 }, { "auxiliary_loss_clip": 0.01075793, "auxiliary_loss_mlp": 0.01028242, "balance_loss_clip": 1.0126853, "balance_loss_mlp": 1.02370799, "epoch": 0.3649481436945739, "flos": 22637159222400.0, "grad_norm": 3.2000084945749094, "language_loss": 0.66391349, "learning_rate": 2.8237001409412055e-06, "loss": 0.68495381, "num_input_tokens_seen": 130383165, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5234375, "step": 6070, "time_per_iteration": 2.419123411178589 }, { "auxiliary_loss_clip": 0.01069843, "auxiliary_loss_mlp": 0.01025939, "balance_loss_clip": 1.01214683, "balance_loss_mlp": 1.02234411, "epoch": 0.36500826694724187, "flos": 21578592666240.0, "grad_norm": 1.761209779560282, "language_loss": 0.74285257, "learning_rate": 2.8233558350363974e-06, "loss": 0.76381034, "num_input_tokens_seen": 130402425, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.47460938, "step": 6071, "time_per_iteration": 2.412306785583496 }, { "auxiliary_loss_clip": 0.01070685, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.01219058, "balance_loss_mlp": 1.02212548, "epoch": 0.36506839019990983, "flos": 13771664081280.0, "grad_norm": 2.774690388639247, "language_loss": 0.88472986, "learning_rate": 2.823011499748137e-06, "loss": 0.90571928, "num_input_tokens_seen": 130419440, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.484375, "step": 6072, "time_per_iteration": 2.3721706867218018 }, { "auxiliary_loss_clip": 0.01071892, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.01757336, "balance_loss_mlp": 1.0234673, "epoch": 0.3651285134525778, "flos": 17274011230080.0, "grad_norm": 2.1909947639588734, "language_loss": 0.72709632, "learning_rate": 2.8226671350887136e-06, "loss": 0.74814063, "num_input_tokens_seen": 130438495, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.484375, "step": 6073, "time_per_iteration": 2.3870556354522705 }, { "auxiliary_loss_clip": 0.01074635, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.01974654, "balance_loss_mlp": 1.0242734, "epoch": 0.36518863670524576, "flos": 21906915891840.0, "grad_norm": 2.1040590566084507, "language_loss": 0.67018306, "learning_rate": 2.8223227410704163e-06, "loss": 0.69128215, "num_input_tokens_seen": 130455575, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.50390625, "step": 6074, "time_per_iteration": 2.3765158653259277 }, { "auxiliary_loss_clip": 0.01070799, "auxiliary_loss_mlp": 0.01028698, "balance_loss_clip": 1.01332605, "balance_loss_mlp": 1.0223608, "epoch": 0.3652487599579137, "flos": 27121054734720.0, "grad_norm": 1.4666752069201787, "language_loss": 0.72824764, "learning_rate": 2.8219783177055355e-06, "loss": 0.74924266, "num_input_tokens_seen": 130476385, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.484375, "step": 6075, "time_per_iteration": 2.4290153980255127 }, { "auxiliary_loss_clip": 0.01076606, "auxiliary_loss_mlp": 0.01033471, "balance_loss_clip": 1.01736557, "balance_loss_mlp": 1.02360809, "epoch": 0.3653088832105817, "flos": 19754555351040.0, "grad_norm": 2.235700546025527, "language_loss": 0.89782155, "learning_rate": 2.821633865006363e-06, "loss": 0.91892231, "num_input_tokens_seen": 130493630, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.52734375, "step": 6076, "time_per_iteration": 2.3699605464935303 }, { "auxiliary_loss_clip": 0.01069922, "auxiliary_loss_mlp": 0.01028779, "balance_loss_clip": 1.01412868, "balance_loss_mlp": 1.0224843, "epoch": 0.36536900646324966, "flos": 13114179757440.0, "grad_norm": 2.0333809197559445, "language_loss": 0.69961256, "learning_rate": 2.8212893829851914e-06, "loss": 0.72059953, "num_input_tokens_seen": 130510735, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.47460938, "step": 6077, "time_per_iteration": 2.3759055137634277 }, { "auxiliary_loss_clip": 0.0101222, "auxiliary_loss_mlp": 0.01003624, "balance_loss_clip": 1.00216389, "balance_loss_mlp": 1.00172603, "epoch": 0.3654291297159176, "flos": 71096743048320.0, "grad_norm": 0.7510107083192209, "language_loss": 0.61749446, "learning_rate": 2.8209448716543145e-06, "loss": 0.63765287, "num_input_tokens_seen": 130577050, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.10498047, "step": 6078, "time_per_iteration": 3.1172690391540527 }, { "auxiliary_loss_clip": 0.01071253, "auxiliary_loss_mlp": 0.01028099, "balance_loss_clip": 1.01387787, "balance_loss_mlp": 1.0223639, "epoch": 0.3654892529685856, "flos": 23616508170240.0, "grad_norm": 2.2250549627565275, "language_loss": 0.78407478, "learning_rate": 2.8206003310260265e-06, "loss": 0.80506825, "num_input_tokens_seen": 130593780, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.48828125, "step": 6079, "time_per_iteration": 2.3918392658233643 }, { "auxiliary_loss_clip": 0.01076139, "auxiliary_loss_mlp": 0.01031004, "balance_loss_clip": 1.01537561, "balance_loss_mlp": 1.02561152, "epoch": 0.36554937622125355, "flos": 43469135130240.0, "grad_norm": 1.7005091387442286, "language_loss": 0.62789857, "learning_rate": 2.820255761112624e-06, "loss": 0.64897001, "num_input_tokens_seen": 130615510, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.50390625, "step": 6080, "time_per_iteration": 2.578517436981201 }, { "auxiliary_loss_clip": 0.0107477, "auxiliary_loss_mlp": 0.0103404, "balance_loss_clip": 1.01839948, "balance_loss_mlp": 1.02311301, "epoch": 0.3656094994739215, "flos": 23293526382720.0, "grad_norm": 2.974710498766856, "language_loss": 0.66998851, "learning_rate": 2.819911161926403e-06, "loss": 0.69107664, "num_input_tokens_seen": 130635410, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.515625, "step": 6081, "time_per_iteration": 2.4205496311187744 }, { "auxiliary_loss_clip": 0.01077766, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.01962256, "balance_loss_mlp": 1.02398562, "epoch": 0.3656696227265895, "flos": 24570823806720.0, "grad_norm": 1.5921363746692543, "language_loss": 0.74886107, "learning_rate": 2.8195665334796617e-06, "loss": 0.76999199, "num_input_tokens_seen": 130657725, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.53515625, "step": 6082, "time_per_iteration": 2.4470021724700928 }, { "auxiliary_loss_clip": 0.01074657, "auxiliary_loss_mlp": 0.01027502, "balance_loss_clip": 1.01280308, "balance_loss_mlp": 1.02507901, "epoch": 0.3657297459792575, "flos": 27927129271680.0, "grad_norm": 1.873788943133173, "language_loss": 0.83015347, "learning_rate": 2.8192218757846993e-06, "loss": 0.85117501, "num_input_tokens_seen": 130678360, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49609375, "step": 6083, "time_per_iteration": 2.4452130794525146 }, { "auxiliary_loss_clip": 0.01012076, "auxiliary_loss_mlp": 0.01001497, "balance_loss_clip": 1.00010812, "balance_loss_mlp": 1.00187576, "epoch": 0.36578986923192547, "flos": 67389631441920.0, "grad_norm": 0.8066155189438377, "language_loss": 0.59282637, "learning_rate": 2.8188771888538148e-06, "loss": 0.61296201, "num_input_tokens_seen": 130742110, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.10205078, "step": 6084, "time_per_iteration": 3.1411938667297363 }, { "auxiliary_loss_clip": 0.01072586, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.01979709, "balance_loss_mlp": 1.02367342, "epoch": 0.36584999248459343, "flos": 20226546288000.0, "grad_norm": 1.8096110485277122, "language_loss": 0.73080671, "learning_rate": 2.8185324726993102e-06, "loss": 0.75188875, "num_input_tokens_seen": 130759870, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.48828125, "step": 6085, "time_per_iteration": 2.3918511867523193 }, { "auxiliary_loss_clip": 0.01075101, "auxiliary_loss_mlp": 0.01033134, "balance_loss_clip": 1.0192461, "balance_loss_mlp": 1.02546644, "epoch": 0.3659101157372614, "flos": 19061459573760.0, "grad_norm": 1.7548351969433356, "language_loss": 0.78040498, "learning_rate": 2.8181877273334875e-06, "loss": 0.80148733, "num_input_tokens_seen": 130778510, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.49609375, "step": 6086, "time_per_iteration": 2.3726563453674316 }, { "auxiliary_loss_clip": 0.01069736, "auxiliary_loss_mlp": 0.01029776, "balance_loss_clip": 1.01596522, "balance_loss_mlp": 1.02252769, "epoch": 0.36597023898992936, "flos": 30809384029440.0, "grad_norm": 1.9169474830485742, "language_loss": 0.76484811, "learning_rate": 2.8178429527686484e-06, "loss": 0.78584319, "num_input_tokens_seen": 130798535, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.47265625, "step": 6087, "time_per_iteration": 2.4766108989715576 }, { "auxiliary_loss_clip": 0.01073796, "auxiliary_loss_mlp": 0.01027462, "balance_loss_clip": 1.01267409, "balance_loss_mlp": 1.02283359, "epoch": 0.36603036224259733, "flos": 20520759248640.0, "grad_norm": 4.837126570229189, "language_loss": 0.70253181, "learning_rate": 2.817498149017099e-06, "loss": 0.72354448, "num_input_tokens_seen": 130816655, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 6088, "time_per_iteration": 2.374274253845215 }, { "auxiliary_loss_clip": 0.01077826, "auxiliary_loss_mlp": 0.01032263, "balance_loss_clip": 1.01544309, "balance_loss_mlp": 1.02386379, "epoch": 0.3660904854952653, "flos": 38327790205440.0, "grad_norm": 1.4893284857481737, "language_loss": 0.79942602, "learning_rate": 2.8171533160911432e-06, "loss": 0.82052696, "num_input_tokens_seen": 130841225, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5390625, "step": 6089, "time_per_iteration": 2.571160316467285 }, { "auxiliary_loss_clip": 0.0107228, "auxiliary_loss_mlp": 0.01027273, "balance_loss_clip": 1.01303935, "balance_loss_mlp": 1.02323604, "epoch": 0.36615060874793326, "flos": 21834471087360.0, "grad_norm": 1.7539765661715723, "language_loss": 0.71559191, "learning_rate": 2.8168084540030873e-06, "loss": 0.73658746, "num_input_tokens_seen": 130861050, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.49023438, "step": 6090, "time_per_iteration": 2.3875107765197754 }, { "auxiliary_loss_clip": 0.01069674, "auxiliary_loss_mlp": 0.01029682, "balance_loss_clip": 1.01669431, "balance_loss_mlp": 1.02335072, "epoch": 0.3662107320006012, "flos": 16580601250560.0, "grad_norm": 1.6736934618082873, "language_loss": 0.74514467, "learning_rate": 2.8164635627652394e-06, "loss": 0.7661382, "num_input_tokens_seen": 130879775, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.4609375, "step": 6091, "time_per_iteration": 2.438204050064087 }, { "auxiliary_loss_clip": 0.01073348, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 1.01793921, "balance_loss_mlp": 1.02414513, "epoch": 0.3662708552532692, "flos": 20957348200320.0, "grad_norm": 1.7218782317398558, "language_loss": 0.72412252, "learning_rate": 2.8161186423899067e-06, "loss": 0.74517834, "num_input_tokens_seen": 130898070, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.4921875, "step": 6092, "time_per_iteration": 2.3788232803344727 }, { "auxiliary_loss_clip": 0.01074102, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.01696229, "balance_loss_mlp": 1.02433026, "epoch": 0.36633097850593715, "flos": 21901783921920.0, "grad_norm": 3.503077265508304, "language_loss": 0.78127027, "learning_rate": 2.8157736928893995e-06, "loss": 0.80233216, "num_input_tokens_seen": 130915250, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.49804688, "step": 6093, "time_per_iteration": 2.3988356590270996 }, { "auxiliary_loss_clip": 0.01073091, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.0170908, "balance_loss_mlp": 1.0216434, "epoch": 0.3663911017586051, "flos": 32852745705600.0, "grad_norm": 2.905177954514504, "language_loss": 0.74240935, "learning_rate": 2.815428714276027e-06, "loss": 0.76346028, "num_input_tokens_seen": 130936995, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.515625, "step": 6094, "time_per_iteration": 2.4882853031158447 }, { "auxiliary_loss_clip": 0.01076868, "auxiliary_loss_mlp": 0.01033887, "balance_loss_clip": 1.01922989, "balance_loss_mlp": 1.02538097, "epoch": 0.3664512250112731, "flos": 27270517731840.0, "grad_norm": 1.6249883141313535, "language_loss": 0.79396409, "learning_rate": 2.8150837065621016e-06, "loss": 0.8150717, "num_input_tokens_seen": 130957970, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.515625, "step": 6095, "time_per_iteration": 2.4629571437835693 }, { "auxiliary_loss_clip": 0.01073984, "auxiliary_loss_mlp": 0.01028103, "balance_loss_clip": 1.01143146, "balance_loss_mlp": 1.02212191, "epoch": 0.3665113482639411, "flos": 17783498833920.0, "grad_norm": 2.5555783459106873, "language_loss": 0.73315299, "learning_rate": 2.8147386697599346e-06, "loss": 0.75417387, "num_input_tokens_seen": 130974915, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.515625, "step": 6096, "time_per_iteration": 2.3589084148406982 }, { "auxiliary_loss_clip": 0.01073089, "auxiliary_loss_mlp": 0.0102737, "balance_loss_clip": 1.0123142, "balance_loss_mlp": 1.02268195, "epoch": 0.36657147151660907, "flos": 27853392746880.0, "grad_norm": 1.7622378318724488, "language_loss": 0.66725016, "learning_rate": 2.8143936038818412e-06, "loss": 0.68825483, "num_input_tokens_seen": 130995745, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.50390625, "step": 6097, "time_per_iteration": 2.444540023803711 }, { "auxiliary_loss_clip": 0.01074246, "auxiliary_loss_mlp": 0.01035125, "balance_loss_clip": 1.02024198, "balance_loss_mlp": 1.02339411, "epoch": 0.36663159476927704, "flos": 25372848625920.0, "grad_norm": 1.5661748626378365, "language_loss": 0.7748847, "learning_rate": 2.8140485089401344e-06, "loss": 0.79597843, "num_input_tokens_seen": 131015545, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.5078125, "step": 6098, "time_per_iteration": 2.4213931560516357 }, { "auxiliary_loss_clip": 0.01070718, "auxiliary_loss_mlp": 0.01027452, "balance_loss_clip": 1.01336694, "balance_loss_mlp": 1.02303529, "epoch": 0.366691718021945, "flos": 21356265928320.0, "grad_norm": 1.6360923787194308, "language_loss": 0.73556, "learning_rate": 2.8137033849471305e-06, "loss": 0.75654173, "num_input_tokens_seen": 131033990, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.4765625, "step": 6099, "time_per_iteration": 2.388598918914795 }, { "auxiliary_loss_clip": 0.01068536, "auxiliary_loss_mlp": 0.01033656, "balance_loss_clip": 1.01951194, "balance_loss_mlp": 1.02262831, "epoch": 0.36675184127461297, "flos": 16799436852480.0, "grad_norm": 1.8494578564905462, "language_loss": 0.84355438, "learning_rate": 2.8133582319151456e-06, "loss": 0.86457634, "num_input_tokens_seen": 131050710, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.45898438, "step": 6100, "time_per_iteration": 3.8065085411071777 }, { "auxiliary_loss_clip": 0.01074379, "auxiliary_loss_mlp": 0.01026763, "balance_loss_clip": 1.01202273, "balance_loss_mlp": 1.02243161, "epoch": 0.36681196452728093, "flos": 21905484526080.0, "grad_norm": 2.4890628118262144, "language_loss": 0.70205688, "learning_rate": 2.8130130498564975e-06, "loss": 0.72306836, "num_input_tokens_seen": 131071435, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.51953125, "step": 6101, "time_per_iteration": 2.4087295532226562 }, { "auxiliary_loss_clip": 0.01073664, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 1.01923847, "balance_loss_mlp": 1.02254891, "epoch": 0.3668720877799489, "flos": 17711472965760.0, "grad_norm": 2.367555584335422, "language_loss": 0.76163924, "learning_rate": 2.8126678387835057e-06, "loss": 0.78272951, "num_input_tokens_seen": 131088775, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.51171875, "step": 6102, "time_per_iteration": 2.3563108444213867 }, { "auxiliary_loss_clip": 0.01076192, "auxiliary_loss_mlp": 0.01031804, "balance_loss_clip": 1.01512623, "balance_loss_mlp": 1.02343702, "epoch": 0.36693221103261686, "flos": 47043717615360.0, "grad_norm": 1.7124634897630733, "language_loss": 0.70331347, "learning_rate": 2.812322598708489e-06, "loss": 0.72439349, "num_input_tokens_seen": 131112800, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.52734375, "step": 6103, "time_per_iteration": 4.036387920379639 }, { "auxiliary_loss_clip": 0.01074232, "auxiliary_loss_mlp": 0.01030023, "balance_loss_clip": 1.01553893, "balance_loss_mlp": 1.02312946, "epoch": 0.3669923342852848, "flos": 15960020100480.0, "grad_norm": 1.9806521522904572, "language_loss": 0.71845764, "learning_rate": 2.811977329643768e-06, "loss": 0.73950016, "num_input_tokens_seen": 131131150, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.51171875, "step": 6104, "time_per_iteration": 2.397360324859619 }, { "auxiliary_loss_clip": 0.0107291, "auxiliary_loss_mlp": 0.01028384, "balance_loss_clip": 1.01295865, "balance_loss_mlp": 1.02321506, "epoch": 0.3670524575379528, "flos": 19973460775680.0, "grad_norm": 1.728501736126496, "language_loss": 0.81408119, "learning_rate": 2.8116320316016646e-06, "loss": 0.83509409, "num_input_tokens_seen": 131150365, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.49609375, "step": 6105, "time_per_iteration": 3.898510217666626 }, { "auxiliary_loss_clip": 0.01078725, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 1.01938462, "balance_loss_mlp": 1.02527511, "epoch": 0.36711258079062076, "flos": 25701765344640.0, "grad_norm": 1.6666451576869006, "language_loss": 0.8094269, "learning_rate": 2.8112867045945016e-06, "loss": 0.83057308, "num_input_tokens_seen": 131169310, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.53125, "step": 6106, "time_per_iteration": 2.4523539543151855 }, { "auxiliary_loss_clip": 0.01011356, "auxiliary_loss_mlp": 0.01009177, "balance_loss_clip": 1.00791371, "balance_loss_mlp": 1.00122011, "epoch": 0.3671727040432887, "flos": 60769364791680.0, "grad_norm": 0.6889625019557507, "language_loss": 0.59163457, "learning_rate": 2.8109413486346044e-06, "loss": 0.61183989, "num_input_tokens_seen": 131232900, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.1015625, "step": 6107, "time_per_iteration": 3.0986409187316895 }, { "auxiliary_loss_clip": 0.01072713, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.0108968, "balance_loss_mlp": 1.02281106, "epoch": 0.3672328272959567, "flos": 18660307518720.0, "grad_norm": 1.4899114665702824, "language_loss": 0.74680829, "learning_rate": 2.810595963734295e-06, "loss": 0.76779592, "num_input_tokens_seen": 131250920, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5, "step": 6108, "time_per_iteration": 3.8320603370666504 }, { "auxiliary_loss_clip": 0.01073165, "auxiliary_loss_mlp": 0.01032712, "balance_loss_clip": 1.01732183, "balance_loss_mlp": 1.02256417, "epoch": 0.3672929505486247, "flos": 15048158544000.0, "grad_norm": 2.2105412007391614, "language_loss": 0.73425055, "learning_rate": 2.810250549905901e-06, "loss": 0.75530934, "num_input_tokens_seen": 131267910, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5078125, "step": 6109, "time_per_iteration": 2.378941774368286 }, { "auxiliary_loss_clip": 0.01072392, "auxiliary_loss_mlp": 0.01027169, "balance_loss_clip": 1.01346624, "balance_loss_mlp": 1.02250171, "epoch": 0.3673530738012927, "flos": 20588456108160.0, "grad_norm": 2.1505767009493573, "language_loss": 0.52575916, "learning_rate": 2.80990510716175e-06, "loss": 0.54675484, "num_input_tokens_seen": 131287150, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.5, "step": 6110, "time_per_iteration": 2.4200210571289062 }, { "auxiliary_loss_clip": 0.01073563, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.01585174, "balance_loss_mlp": 1.02452254, "epoch": 0.36741319705396064, "flos": 21688743605760.0, "grad_norm": 1.530504211503779, "language_loss": 0.80748588, "learning_rate": 2.8095596355141676e-06, "loss": 0.82852477, "num_input_tokens_seen": 131308225, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49023438, "step": 6111, "time_per_iteration": 2.420271635055542 }, { "auxiliary_loss_clip": 0.0107125, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.0179708, "balance_loss_mlp": 1.02336311, "epoch": 0.3674733203066286, "flos": 29860898590080.0, "grad_norm": 1.4739418271489957, "language_loss": 0.72328079, "learning_rate": 2.809214134975485e-06, "loss": 0.74432081, "num_input_tokens_seen": 131332115, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.47851562, "step": 6112, "time_per_iteration": 2.4564170837402344 }, { "auxiliary_loss_clip": 0.01073792, "auxiliary_loss_mlp": 0.01038512, "balance_loss_clip": 1.02386117, "balance_loss_mlp": 1.02396154, "epoch": 0.36753344355929657, "flos": 18256118175360.0, "grad_norm": 1.527052847752993, "language_loss": 0.85310948, "learning_rate": 2.8088686055580315e-06, "loss": 0.87423253, "num_input_tokens_seen": 131351885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49804688, "step": 6113, "time_per_iteration": 2.3928356170654297 }, { "auxiliary_loss_clip": 0.01075114, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.01707113, "balance_loss_mlp": 1.02370787, "epoch": 0.36759356681196453, "flos": 25299984885120.0, "grad_norm": 1.7608819048036308, "language_loss": 0.78340703, "learning_rate": 2.8085230472741377e-06, "loss": 0.80448067, "num_input_tokens_seen": 131370245, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.515625, "step": 6114, "time_per_iteration": 2.4111688137054443 }, { "auxiliary_loss_clip": 0.010784, "auxiliary_loss_mlp": 0.0103524, "balance_loss_clip": 1.01853895, "balance_loss_mlp": 1.02454901, "epoch": 0.3676536900646325, "flos": 21031887686400.0, "grad_norm": 1.763906217716276, "language_loss": 0.67075121, "learning_rate": 2.808177460136137e-06, "loss": 0.69188762, "num_input_tokens_seen": 131388115, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5390625, "step": 6115, "time_per_iteration": 2.3944694995880127 }, { "auxiliary_loss_clip": 0.01071051, "auxiliary_loss_mlp": 0.01026193, "balance_loss_clip": 1.01204896, "balance_loss_mlp": 1.02275336, "epoch": 0.36771381331730046, "flos": 16287610187520.0, "grad_norm": 2.6289992092190957, "language_loss": 0.76909393, "learning_rate": 2.807831844156361e-06, "loss": 0.79006636, "num_input_tokens_seen": 131404595, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48242188, "step": 6116, "time_per_iteration": 2.3609845638275146 }, { "auxiliary_loss_clip": 0.01071148, "auxiliary_loss_mlp": 0.01029104, "balance_loss_clip": 1.01534724, "balance_loss_mlp": 1.02222228, "epoch": 0.36777393656996843, "flos": 22308870908160.0, "grad_norm": 1.967186657552637, "language_loss": 0.63121545, "learning_rate": 2.8074861993471444e-06, "loss": 0.65221786, "num_input_tokens_seen": 131423760, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48828125, "step": 6117, "time_per_iteration": 2.448568820953369 }, { "auxiliary_loss_clip": 0.01071838, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.01490819, "balance_loss_mlp": 1.02300096, "epoch": 0.3678340598226364, "flos": 26832846528000.0, "grad_norm": 2.381143549490389, "language_loss": 0.73201048, "learning_rate": 2.807140525720822e-06, "loss": 0.75302595, "num_input_tokens_seen": 131444955, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.48828125, "step": 6118, "time_per_iteration": 2.4490222930908203 }, { "auxiliary_loss_clip": 0.01077614, "auxiliary_loss_mlp": 0.01035592, "balance_loss_clip": 1.01834214, "balance_loss_mlp": 1.02380753, "epoch": 0.36789418307530436, "flos": 21760664739840.0, "grad_norm": 1.8290684607763328, "language_loss": 0.72595912, "learning_rate": 2.8067948232897314e-06, "loss": 0.74709117, "num_input_tokens_seen": 131465720, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.5390625, "step": 6119, "time_per_iteration": 2.434501886367798 }, { "auxiliary_loss_clip": 0.0107327, "auxiliary_loss_mlp": 0.01030152, "balance_loss_clip": 1.01558518, "balance_loss_mlp": 1.02385104, "epoch": 0.3679543063279723, "flos": 15923291483520.0, "grad_norm": 1.7799372833722993, "language_loss": 0.80388439, "learning_rate": 2.806449092066209e-06, "loss": 0.82491863, "num_input_tokens_seen": 131483080, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.49414062, "step": 6120, "time_per_iteration": 2.3701846599578857 }, { "auxiliary_loss_clip": 0.01072336, "auxiliary_loss_mlp": 0.01036467, "balance_loss_clip": 1.0213995, "balance_loss_mlp": 1.02289307, "epoch": 0.3680144295806403, "flos": 24274516164480.0, "grad_norm": 1.9158094248540187, "language_loss": 0.64188147, "learning_rate": 2.8061033320625923e-06, "loss": 0.66296947, "num_input_tokens_seen": 131502545, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49609375, "step": 6121, "time_per_iteration": 2.4181020259857178 }, { "auxiliary_loss_clip": 0.01077199, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.01809859, "balance_loss_mlp": 1.02500939, "epoch": 0.36807455283330826, "flos": 26102952311040.0, "grad_norm": 3.524007778996557, "language_loss": 0.71480983, "learning_rate": 2.8057575432912215e-06, "loss": 0.73591709, "num_input_tokens_seen": 131522155, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5234375, "step": 6122, "time_per_iteration": 2.4268269538879395 }, { "auxiliary_loss_clip": 0.01071565, "auxiliary_loss_mlp": 0.01028494, "balance_loss_clip": 1.01360464, "balance_loss_mlp": 1.02429342, "epoch": 0.3681346760859763, "flos": 24643827192960.0, "grad_norm": 1.9234544111295393, "language_loss": 0.69032305, "learning_rate": 2.805411725764436e-06, "loss": 0.71132362, "num_input_tokens_seen": 131543865, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.47265625, "step": 6123, "time_per_iteration": 2.4417355060577393 }, { "auxiliary_loss_clip": 0.01077047, "auxiliary_loss_mlp": 0.01031129, "balance_loss_clip": 1.01446366, "balance_loss_mlp": 1.02417839, "epoch": 0.36819479933864424, "flos": 23877239270400.0, "grad_norm": 2.033338858680913, "language_loss": 0.73455763, "learning_rate": 2.805065879494579e-06, "loss": 0.75563937, "num_input_tokens_seen": 131562155, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.52734375, "step": 6124, "time_per_iteration": 2.4074692726135254 }, { "auxiliary_loss_clip": 0.01073393, "auxiliary_loss_mlp": 0.01036235, "balance_loss_clip": 1.02058899, "balance_loss_mlp": 1.02219164, "epoch": 0.3682549225913122, "flos": 25552895840640.0, "grad_norm": 2.353104072639115, "language_loss": 0.7406553, "learning_rate": 2.804720004493991e-06, "loss": 0.76175153, "num_input_tokens_seen": 131581695, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51171875, "step": 6125, "time_per_iteration": 2.4441373348236084 }, { "auxiliary_loss_clip": 0.01075917, "auxiliary_loss_mlp": 0.01035083, "balance_loss_clip": 1.0186677, "balance_loss_mlp": 1.02454197, "epoch": 0.36831504584398017, "flos": 16945653093120.0, "grad_norm": 1.7670871319194332, "language_loss": 0.78394169, "learning_rate": 2.804374100775016e-06, "loss": 0.80505168, "num_input_tokens_seen": 131599465, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.51171875, "step": 6126, "time_per_iteration": 2.352067470550537 }, { "auxiliary_loss_clip": 0.01075314, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.01406729, "balance_loss_mlp": 1.02231216, "epoch": 0.36837516909664814, "flos": 19864042974720.0, "grad_norm": 2.2242182044548526, "language_loss": 0.66127962, "learning_rate": 2.8040281683499985e-06, "loss": 0.68234676, "num_input_tokens_seen": 131618330, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.53125, "step": 6127, "time_per_iteration": 2.408078193664551 }, { "auxiliary_loss_clip": 0.01078459, "auxiliary_loss_mlp": 0.01027712, "balance_loss_clip": 1.01140451, "balance_loss_mlp": 1.02582002, "epoch": 0.3684352923493161, "flos": 37625652385920.0, "grad_norm": 1.7119671063003425, "language_loss": 0.70323122, "learning_rate": 2.8036822072312835e-06, "loss": 0.72429293, "num_input_tokens_seen": 131638960, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.52734375, "step": 6128, "time_per_iteration": 2.5249085426330566 }, { "auxiliary_loss_clip": 0.01076101, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.01847959, "balance_loss_mlp": 1.02550697, "epoch": 0.36849541560198407, "flos": 14464620213120.0, "grad_norm": 1.723113560882796, "language_loss": 0.75043875, "learning_rate": 2.803336217431218e-06, "loss": 0.77153438, "num_input_tokens_seen": 131657440, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 6129, "time_per_iteration": 2.3905277252197266 }, { "auxiliary_loss_clip": 0.01073848, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.01796436, "balance_loss_mlp": 1.02329254, "epoch": 0.36855553885465203, "flos": 25769706583680.0, "grad_norm": 1.5522875672930927, "language_loss": 0.84871697, "learning_rate": 2.80299019896215e-06, "loss": 0.86978114, "num_input_tokens_seen": 131678035, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.50390625, "step": 6130, "time_per_iteration": 2.446664333343506 }, { "auxiliary_loss_clip": 0.01012405, "auxiliary_loss_mlp": 0.01001117, "balance_loss_clip": 0.9997645, "balance_loss_mlp": 1.00186753, "epoch": 0.36861566210732, "flos": 65045701071360.0, "grad_norm": 0.809886112055028, "language_loss": 0.60249758, "learning_rate": 2.8026441518364262e-06, "loss": 0.62263286, "num_input_tokens_seen": 131742470, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.10546875, "step": 6131, "time_per_iteration": 3.138420343399048 }, { "auxiliary_loss_clip": 0.01070932, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.01451361, "balance_loss_mlp": 1.0219245, "epoch": 0.36867578535998796, "flos": 30953226297600.0, "grad_norm": 1.5012132493405446, "language_loss": 0.72874516, "learning_rate": 2.8022980760663977e-06, "loss": 0.74974728, "num_input_tokens_seen": 131764570, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.4921875, "step": 6132, "time_per_iteration": 2.466062545776367 }, { "auxiliary_loss_clip": 0.01076173, "auxiliary_loss_mlp": 0.01030889, "balance_loss_clip": 1.01473641, "balance_loss_mlp": 1.02380884, "epoch": 0.3687359086126559, "flos": 28836756501120.0, "grad_norm": 1.768903603637774, "language_loss": 0.74087232, "learning_rate": 2.8019519716644147e-06, "loss": 0.76194292, "num_input_tokens_seen": 131785720, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5234375, "step": 6133, "time_per_iteration": 2.467952013015747 }, { "auxiliary_loss_clip": 0.0107123, "auxiliary_loss_mlp": 0.01032985, "balance_loss_clip": 1.01807165, "balance_loss_mlp": 1.02409494, "epoch": 0.3687960318653239, "flos": 21395752542720.0, "grad_norm": 2.7257354337482638, "language_loss": 0.71565998, "learning_rate": 2.801605838642829e-06, "loss": 0.73670214, "num_input_tokens_seen": 131804430, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.47265625, "step": 6134, "time_per_iteration": 2.389812707901001 }, { "auxiliary_loss_clip": 0.01073092, "auxiliary_loss_mlp": 0.01028475, "balance_loss_clip": 1.01295376, "balance_loss_mlp": 1.02352655, "epoch": 0.36885615511799186, "flos": 20265020472960.0, "grad_norm": 1.632198933011989, "language_loss": 0.75171113, "learning_rate": 2.8012596770139933e-06, "loss": 0.77272677, "num_input_tokens_seen": 131822060, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.49609375, "step": 6135, "time_per_iteration": 2.402489185333252 }, { "auxiliary_loss_clip": 0.01012098, "auxiliary_loss_mlp": 0.00997505, "balance_loss_clip": 0.99619401, "balance_loss_mlp": 1.00174928, "epoch": 0.3689162783706599, "flos": 63085922789760.0, "grad_norm": 0.8170817831202992, "language_loss": 0.58822632, "learning_rate": 2.80091348679026e-06, "loss": 0.60832238, "num_input_tokens_seen": 131880715, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.10351562, "step": 6136, "time_per_iteration": 2.981640338897705 }, { "auxiliary_loss_clip": 0.01072818, "auxiliary_loss_mlp": 0.01028872, "balance_loss_clip": 1.01381636, "balance_loss_mlp": 1.0235889, "epoch": 0.36897640162332784, "flos": 10961225723520.0, "grad_norm": 1.8049396648927767, "language_loss": 0.79098797, "learning_rate": 2.800567267983985e-06, "loss": 0.8120048, "num_input_tokens_seen": 131895850, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.4921875, "step": 6137, "time_per_iteration": 2.38376522064209 }, { "auxiliary_loss_clip": 0.01075635, "auxiliary_loss_mlp": 0.01040549, "balance_loss_clip": 1.02424133, "balance_loss_mlp": 1.02535903, "epoch": 0.3690365248759958, "flos": 20703250258560.0, "grad_norm": 1.7896905927516755, "language_loss": 0.7387743, "learning_rate": 2.8002210206075233e-06, "loss": 0.75993609, "num_input_tokens_seen": 131915775, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5, "step": 6138, "time_per_iteration": 2.426119327545166 }, { "auxiliary_loss_clip": 0.01077474, "auxiliary_loss_mlp": 0.01032328, "balance_loss_clip": 1.01556754, "balance_loss_mlp": 1.02376294, "epoch": 0.3690966481286638, "flos": 31825182303360.0, "grad_norm": 1.7406841382641254, "language_loss": 0.65257591, "learning_rate": 2.7998747446732315e-06, "loss": 0.67367387, "num_input_tokens_seen": 131935715, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.53515625, "step": 6139, "time_per_iteration": 3.9093217849731445 }, { "auxiliary_loss_clip": 0.01070738, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.01935506, "balance_loss_mlp": 1.02215028, "epoch": 0.36915677138133174, "flos": 13114109934720.0, "grad_norm": 2.0513219928312214, "language_loss": 0.71410334, "learning_rate": 2.7995284401934677e-06, "loss": 0.73515773, "num_input_tokens_seen": 131954120, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.484375, "step": 6140, "time_per_iteration": 2.3560144901275635 }, { "auxiliary_loss_clip": 0.01011733, "auxiliary_loss_mlp": 0.01012969, "balance_loss_clip": 1.01162171, "balance_loss_mlp": 1.00160146, "epoch": 0.3692168946339997, "flos": 68683372116480.0, "grad_norm": 0.7434864879686168, "language_loss": 0.59351194, "learning_rate": 2.7991821071805906e-06, "loss": 0.61375892, "num_input_tokens_seen": 132017485, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.1015625, "step": 6141, "time_per_iteration": 3.1399033069610596 }, { "auxiliary_loss_clip": 0.01072174, "auxiliary_loss_mlp": 0.0103299, "balance_loss_clip": 1.01775515, "balance_loss_mlp": 1.02185011, "epoch": 0.36927701788666767, "flos": 22016787540480.0, "grad_norm": 1.6896237242923156, "language_loss": 0.75097811, "learning_rate": 2.7988357456469605e-06, "loss": 0.77202976, "num_input_tokens_seen": 132036760, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.50390625, "step": 6142, "time_per_iteration": 2.39654803276062 }, { "auxiliary_loss_clip": 0.01072031, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.01261878, "balance_loss_mlp": 1.02329087, "epoch": 0.36933714113933563, "flos": 21834505998720.0, "grad_norm": 2.1855094497804997, "language_loss": 0.76777643, "learning_rate": 2.7984893556049365e-06, "loss": 0.78877193, "num_input_tokens_seen": 132056935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.48828125, "step": 6143, "time_per_iteration": 3.908581018447876 }, { "auxiliary_loss_clip": 0.01071934, "auxiliary_loss_mlp": 0.01027748, "balance_loss_clip": 1.01339555, "balance_loss_mlp": 1.02334738, "epoch": 0.3693972643920036, "flos": 23690698542720.0, "grad_norm": 1.5964966895470476, "language_loss": 0.82052958, "learning_rate": 2.7981429370668815e-06, "loss": 0.84152639, "num_input_tokens_seen": 132077285, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.48632812, "step": 6144, "time_per_iteration": 3.936103343963623 }, { "auxiliary_loss_clip": 0.01072841, "auxiliary_loss_mlp": 0.01031839, "balance_loss_clip": 1.01730764, "balance_loss_mlp": 1.0217663, "epoch": 0.36945738764467156, "flos": 22855645710720.0, "grad_norm": 2.649624064880404, "language_loss": 0.77396673, "learning_rate": 2.797796490045158e-06, "loss": 0.79501355, "num_input_tokens_seen": 132095520, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.51171875, "step": 6145, "time_per_iteration": 2.4452884197235107 }, { "auxiliary_loss_clip": 0.0107695, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 1.01095879, "balance_loss_mlp": 1.02534926, "epoch": 0.36951751089733953, "flos": 16615060629120.0, "grad_norm": 2.019876258482959, "language_loss": 0.76790512, "learning_rate": 2.7974500145521304e-06, "loss": 0.78893888, "num_input_tokens_seen": 132112810, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.515625, "step": 6146, "time_per_iteration": 2.370117664337158 }, { "auxiliary_loss_clip": 0.01074658, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 1.01977897, "balance_loss_mlp": 1.02355146, "epoch": 0.3695776341500075, "flos": 18913602499200.0, "grad_norm": 1.495732302996305, "language_loss": 0.80735236, "learning_rate": 2.7971035106001636e-06, "loss": 0.82845902, "num_input_tokens_seen": 132131615, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.51171875, "step": 6147, "time_per_iteration": 3.8307507038116455 }, { "auxiliary_loss_clip": 0.01073759, "auxiliary_loss_mlp": 0.01029099, "balance_loss_clip": 1.01432931, "balance_loss_mlp": 1.02287328, "epoch": 0.36963775740267546, "flos": 20807571000960.0, "grad_norm": 1.788172256452552, "language_loss": 0.83178854, "learning_rate": 2.796756978201622e-06, "loss": 0.85281712, "num_input_tokens_seen": 132149585, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5078125, "step": 6148, "time_per_iteration": 2.429959535598755 }, { "auxiliary_loss_clip": 0.01071161, "auxiliary_loss_mlp": 0.01032553, "balance_loss_clip": 1.01699615, "balance_loss_mlp": 1.02326953, "epoch": 0.3696978806553435, "flos": 26060847344640.0, "grad_norm": 2.394569391033449, "language_loss": 0.73839736, "learning_rate": 2.7964104173688735e-06, "loss": 0.75943446, "num_input_tokens_seen": 132165555, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.47851562, "step": 6149, "time_per_iteration": 2.4272594451904297 }, { "auxiliary_loss_clip": 0.01074895, "auxiliary_loss_mlp": 0.01036226, "balance_loss_clip": 1.01898813, "balance_loss_mlp": 1.02445626, "epoch": 0.36975800390801145, "flos": 26832706882560.0, "grad_norm": 2.2923998020270555, "language_loss": 0.70792317, "learning_rate": 2.796063828114286e-06, "loss": 0.72903436, "num_input_tokens_seen": 132185100, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.50390625, "step": 6150, "time_per_iteration": 2.439673662185669 }, { "auxiliary_loss_clip": 0.01073981, "auxiliary_loss_mlp": 0.01037773, "balance_loss_clip": 1.02262712, "balance_loss_mlp": 1.02396321, "epoch": 0.3698181271606794, "flos": 21141549866880.0, "grad_norm": 1.5323359610565865, "language_loss": 0.8186698, "learning_rate": 2.795717210450228e-06, "loss": 0.8397873, "num_input_tokens_seen": 132203930, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.5, "step": 6151, "time_per_iteration": 2.443286657333374 }, { "auxiliary_loss_clip": 0.01013235, "auxiliary_loss_mlp": 0.01001615, "balance_loss_clip": 1.00013053, "balance_loss_mlp": 1.0029459, "epoch": 0.3698782504133474, "flos": 66739478503680.0, "grad_norm": 0.7747616968948969, "language_loss": 0.63107443, "learning_rate": 2.7953705643890705e-06, "loss": 0.65122294, "num_input_tokens_seen": 132263845, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.10253906, "step": 6152, "time_per_iteration": 3.131995439529419 }, { "auxiliary_loss_clip": 0.0107044, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.01950431, "balance_loss_mlp": 1.02314484, "epoch": 0.36993837366601534, "flos": 24310511642880.0, "grad_norm": 2.1349998952096096, "language_loss": 0.7014755, "learning_rate": 2.7950238899431827e-06, "loss": 0.72251964, "num_input_tokens_seen": 132282350, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.47265625, "step": 6153, "time_per_iteration": 2.4402735233306885 }, { "auxiliary_loss_clip": 0.01072548, "auxiliary_loss_mlp": 0.01030596, "balance_loss_clip": 1.01481295, "balance_loss_mlp": 1.02292311, "epoch": 0.3699984969186833, "flos": 24348147955200.0, "grad_norm": 1.689951032689083, "language_loss": 0.72462368, "learning_rate": 2.7946771871249374e-06, "loss": 0.74565518, "num_input_tokens_seen": 132301930, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.49609375, "step": 6154, "time_per_iteration": 2.4188525676727295 }, { "auxiliary_loss_clip": 0.01072557, "auxiliary_loss_mlp": 0.01033567, "balance_loss_clip": 1.01965499, "balance_loss_mlp": 1.02387547, "epoch": 0.37005862017135127, "flos": 19828117319040.0, "grad_norm": 1.6489161345322185, "language_loss": 0.67670542, "learning_rate": 2.794330455946707e-06, "loss": 0.69776672, "num_input_tokens_seen": 132320915, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.48632812, "step": 6155, "time_per_iteration": 2.3896164894104004 }, { "auxiliary_loss_clip": 0.01072978, "auxiliary_loss_mlp": 0.01027271, "balance_loss_clip": 1.01282251, "balance_loss_mlp": 1.02416444, "epoch": 0.37011874342401924, "flos": 19572762568320.0, "grad_norm": 1.726609775642434, "language_loss": 0.6764698, "learning_rate": 2.7939836964208665e-06, "loss": 0.69747233, "num_input_tokens_seen": 132340415, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48632812, "step": 6156, "time_per_iteration": 2.384624481201172 }, { "auxiliary_loss_clip": 0.0107055, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.01825714, "balance_loss_mlp": 1.02271199, "epoch": 0.3701788666766872, "flos": 20373356021760.0, "grad_norm": 1.6829890275997992, "language_loss": 0.82059813, "learning_rate": 2.7936369085597895e-06, "loss": 0.8416248, "num_input_tokens_seen": 132358600, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.4765625, "step": 6157, "time_per_iteration": 2.3858392238616943 }, { "auxiliary_loss_clip": 0.01075272, "auxiliary_loss_mlp": 0.01035344, "balance_loss_clip": 1.01779675, "balance_loss_mlp": 1.02302527, "epoch": 0.37023898992935517, "flos": 15340032443520.0, "grad_norm": 2.2596027278673816, "language_loss": 0.76417756, "learning_rate": 2.793290092375853e-06, "loss": 0.78528368, "num_input_tokens_seen": 132373160, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.5234375, "step": 6158, "time_per_iteration": 2.3714630603790283 }, { "auxiliary_loss_clip": 0.01074523, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.01333809, "balance_loss_mlp": 1.02275419, "epoch": 0.37029911318202313, "flos": 19572902213760.0, "grad_norm": 2.1155851363161347, "language_loss": 0.69438392, "learning_rate": 2.7929432478814346e-06, "loss": 0.71541995, "num_input_tokens_seen": 132392345, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.51953125, "step": 6159, "time_per_iteration": 2.3774023056030273 }, { "auxiliary_loss_clip": 0.01069322, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.01967525, "balance_loss_mlp": 1.02098835, "epoch": 0.3703592364346911, "flos": 26212160643840.0, "grad_norm": 2.283030665095097, "language_loss": 0.70785308, "learning_rate": 2.7925963750889108e-06, "loss": 0.72888285, "num_input_tokens_seen": 132412620, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.484375, "step": 6160, "time_per_iteration": 2.4150218963623047 }, { "auxiliary_loss_clip": 0.01067627, "auxiliary_loss_mlp": 0.01025522, "balance_loss_clip": 1.01220083, "balance_loss_mlp": 1.02091527, "epoch": 0.37041935968735906, "flos": 20047267123200.0, "grad_norm": 1.5429641045666378, "language_loss": 0.79158479, "learning_rate": 2.792249474010661e-06, "loss": 0.81251633, "num_input_tokens_seen": 132431570, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.46875, "step": 6161, "time_per_iteration": 2.4009366035461426 }, { "auxiliary_loss_clip": 0.01072354, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.01769161, "balance_loss_mlp": 1.02363896, "epoch": 0.3704794829400271, "flos": 24132663843840.0, "grad_norm": 1.6410845084206767, "language_loss": 0.7925939, "learning_rate": 2.791902544659065e-06, "loss": 0.81365436, "num_input_tokens_seen": 132451525, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.48632812, "step": 6162, "time_per_iteration": 2.418328046798706 }, { "auxiliary_loss_clip": 0.01073634, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.02153099, "balance_loss_mlp": 1.02438009, "epoch": 0.37053960619269505, "flos": 14865981736320.0, "grad_norm": 1.7897845932250385, "language_loss": 0.79305756, "learning_rate": 2.7915555870465047e-06, "loss": 0.81415278, "num_input_tokens_seen": 132469875, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.4921875, "step": 6163, "time_per_iteration": 2.374377965927124 }, { "auxiliary_loss_clip": 0.01073365, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.01530254, "balance_loss_mlp": 1.0234623, "epoch": 0.370599729445363, "flos": 21360420380160.0, "grad_norm": 1.640969722414908, "language_loss": 0.68600202, "learning_rate": 2.791208601185362e-06, "loss": 0.70704174, "num_input_tokens_seen": 132488360, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49804688, "step": 6164, "time_per_iteration": 2.399181365966797 }, { "auxiliary_loss_clip": 0.01077313, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.01463354, "balance_loss_mlp": 1.02592254, "epoch": 0.370659852698031, "flos": 26827958937600.0, "grad_norm": 2.197070402237643, "language_loss": 0.82759511, "learning_rate": 2.7908615870880185e-06, "loss": 0.84867883, "num_input_tokens_seen": 132508630, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.515625, "step": 6165, "time_per_iteration": 2.458951234817505 }, { "auxiliary_loss_clip": 0.01075641, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 1.01656222, "balance_loss_mlp": 1.02415347, "epoch": 0.37071997595069894, "flos": 19098013633920.0, "grad_norm": 1.9676734329284329, "language_loss": 0.6925658, "learning_rate": 2.7905145447668605e-06, "loss": 0.71365392, "num_input_tokens_seen": 132527465, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.515625, "step": 6166, "time_per_iteration": 2.410658597946167 }, { "auxiliary_loss_clip": 0.01012471, "auxiliary_loss_mlp": 0.01002587, "balance_loss_clip": 1.00098932, "balance_loss_mlp": 1.00274086, "epoch": 0.3707800992033669, "flos": 52175809163520.0, "grad_norm": 0.7916298208244803, "language_loss": 0.56858432, "learning_rate": 2.790167474234271e-06, "loss": 0.58873487, "num_input_tokens_seen": 132579940, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.09716797, "step": 6167, "time_per_iteration": 2.9531548023223877 }, { "auxiliary_loss_clip": 0.01071433, "auxiliary_loss_mlp": 0.01024003, "balance_loss_clip": 1.01077676, "balance_loss_mlp": 1.02367759, "epoch": 0.3708402224560349, "flos": 19900806503040.0, "grad_norm": 1.8341806717633984, "language_loss": 0.74896836, "learning_rate": 2.7898203755026377e-06, "loss": 0.76992279, "num_input_tokens_seen": 132598390, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.4765625, "step": 6168, "time_per_iteration": 2.458956003189087 }, { "auxiliary_loss_clip": 0.01073177, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.0148499, "balance_loss_mlp": 1.02347374, "epoch": 0.37090034570870284, "flos": 20006698256640.0, "grad_norm": 1.6362399849231193, "language_loss": 0.73621279, "learning_rate": 2.7894732485843465e-06, "loss": 0.75724137, "num_input_tokens_seen": 132616920, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49609375, "step": 6169, "time_per_iteration": 2.4200477600097656 }, { "auxiliary_loss_clip": 0.01070128, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.01733732, "balance_loss_mlp": 1.02295423, "epoch": 0.3709604689613708, "flos": 24133536627840.0, "grad_norm": 2.3200258407583605, "language_loss": 0.79219866, "learning_rate": 2.7891260934917854e-06, "loss": 0.81321013, "num_input_tokens_seen": 132637660, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.47265625, "step": 6170, "time_per_iteration": 2.410937547683716 }, { "auxiliary_loss_clip": 0.01075555, "auxiliary_loss_mlp": 0.0103432, "balance_loss_clip": 1.01871562, "balance_loss_mlp": 1.02447486, "epoch": 0.37102059221403877, "flos": 23875004943360.0, "grad_norm": 1.7120475613100137, "language_loss": 0.76155788, "learning_rate": 2.7887789102373444e-06, "loss": 0.78265667, "num_input_tokens_seen": 132657635, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51171875, "step": 6171, "time_per_iteration": 2.4557530879974365 }, { "auxiliary_loss_clip": 0.01073001, "auxiliary_loss_mlp": 0.01029295, "balance_loss_clip": 1.01421499, "balance_loss_mlp": 1.02395082, "epoch": 0.37108071546670673, "flos": 14500406223360.0, "grad_norm": 2.0934543148346725, "language_loss": 0.80144608, "learning_rate": 2.7884316988334125e-06, "loss": 0.822469, "num_input_tokens_seen": 132674455, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49023438, "step": 6172, "time_per_iteration": 2.373897075653076 }, { "auxiliary_loss_clip": 0.01073653, "auxiliary_loss_mlp": 0.01036751, "balance_loss_clip": 1.02107489, "balance_loss_mlp": 1.02205753, "epoch": 0.3711408387193747, "flos": 34561360465920.0, "grad_norm": 1.6175478360151696, "language_loss": 0.5934695, "learning_rate": 2.7880844592923815e-06, "loss": 0.6145736, "num_input_tokens_seen": 132695140, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.515625, "step": 6173, "time_per_iteration": 2.538891315460205 }, { "auxiliary_loss_clip": 0.0107288, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.01729608, "balance_loss_mlp": 1.02335238, "epoch": 0.37120096197204266, "flos": 17309762328960.0, "grad_norm": 1.7995129521863482, "language_loss": 0.8051486, "learning_rate": 2.787737191626644e-06, "loss": 0.82620013, "num_input_tokens_seen": 132712470, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49609375, "step": 6174, "time_per_iteration": 2.3775181770324707 }, { "auxiliary_loss_clip": 0.01069493, "auxiliary_loss_mlp": 0.01027895, "balance_loss_clip": 1.01350641, "balance_loss_mlp": 1.02220082, "epoch": 0.37126108522471063, "flos": 30662748852480.0, "grad_norm": 8.105359517295618, "language_loss": 0.80235422, "learning_rate": 2.787389895848591e-06, "loss": 0.82332814, "num_input_tokens_seen": 132732945, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.47265625, "step": 6175, "time_per_iteration": 2.4975883960723877 }, { "auxiliary_loss_clip": 0.01074522, "auxiliary_loss_mlp": 0.01040811, "balance_loss_clip": 1.02551639, "balance_loss_mlp": 1.02468562, "epoch": 0.37132120847737865, "flos": 25154466871680.0, "grad_norm": 1.646359475074023, "language_loss": 0.88616079, "learning_rate": 2.78704257197062e-06, "loss": 0.90731406, "num_input_tokens_seen": 132752470, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49804688, "step": 6176, "time_per_iteration": 2.419347047805786 }, { "auxiliary_loss_clip": 0.01074123, "auxiliary_loss_mlp": 0.01034807, "balance_loss_clip": 1.02093101, "balance_loss_mlp": 1.02398229, "epoch": 0.3713813317300466, "flos": 21212458571520.0, "grad_norm": 1.5523897519054592, "language_loss": 0.73386168, "learning_rate": 2.7866952200051224e-06, "loss": 0.754951, "num_input_tokens_seen": 132771485, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.5, "step": 6177, "time_per_iteration": 2.4250547885894775 }, { "auxiliary_loss_clip": 0.01071404, "auxiliary_loss_mlp": 0.01035656, "balance_loss_clip": 1.02051044, "balance_loss_mlp": 1.02271843, "epoch": 0.3714414549827146, "flos": 21615565662720.0, "grad_norm": 1.7272531332666148, "language_loss": 0.75224185, "learning_rate": 2.7863478399644973e-06, "loss": 0.77331245, "num_input_tokens_seen": 132791465, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.48632812, "step": 6178, "time_per_iteration": 2.4231603145599365 }, { "auxiliary_loss_clip": 0.0107545, "auxiliary_loss_mlp": 0.01036685, "balance_loss_clip": 1.02227855, "balance_loss_mlp": 1.02573061, "epoch": 0.37150157823538255, "flos": 19971331182720.0, "grad_norm": 1.6729144938403504, "language_loss": 0.71863556, "learning_rate": 2.786000431861139e-06, "loss": 0.73975694, "num_input_tokens_seen": 132810160, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49804688, "step": 6179, "time_per_iteration": 3.814878463745117 }, { "auxiliary_loss_clip": 0.01074192, "auxiliary_loss_mlp": 0.01032942, "balance_loss_clip": 1.018255, "balance_loss_mlp": 1.02353489, "epoch": 0.3715617014880505, "flos": 24859485861120.0, "grad_norm": 1.6387236846173039, "language_loss": 0.70228338, "learning_rate": 2.7856529957074484e-06, "loss": 0.7233547, "num_input_tokens_seen": 132831265, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5078125, "step": 6180, "time_per_iteration": 2.4519412517547607 }, { "auxiliary_loss_clip": 0.01070058, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 1.01530373, "balance_loss_mlp": 1.02216136, "epoch": 0.3716218247407185, "flos": 20448035153280.0, "grad_norm": 4.263361426471605, "language_loss": 0.77865577, "learning_rate": 2.7853055315158233e-06, "loss": 0.79964721, "num_input_tokens_seen": 132850005, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.47851562, "step": 6181, "time_per_iteration": 2.4347891807556152 }, { "auxiliary_loss_clip": 0.01070912, "auxiliary_loss_mlp": 0.01031494, "balance_loss_clip": 1.01698649, "balance_loss_mlp": 1.02277517, "epoch": 0.37168194799338644, "flos": 24132349641600.0, "grad_norm": 2.0063929499321476, "language_loss": 0.78351223, "learning_rate": 2.7849580392986633e-06, "loss": 0.80453634, "num_input_tokens_seen": 132865790, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48046875, "step": 6182, "time_per_iteration": 3.87495756149292 }, { "auxiliary_loss_clip": 0.01012116, "auxiliary_loss_mlp": 0.01015715, "balance_loss_clip": 1.01415372, "balance_loss_mlp": 1.00223398, "epoch": 0.3717420712460544, "flos": 67405481199360.0, "grad_norm": 0.7888858361685097, "language_loss": 0.57507634, "learning_rate": 2.7846105190683705e-06, "loss": 0.59535468, "num_input_tokens_seen": 132921775, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.09863281, "step": 6183, "time_per_iteration": 3.0429883003234863 }, { "auxiliary_loss_clip": 0.01074568, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.01740015, "balance_loss_mlp": 1.02184033, "epoch": 0.37180219449872237, "flos": 22375974274560.0, "grad_norm": 3.168425218040862, "language_loss": 0.76708633, "learning_rate": 2.7842629708373466e-06, "loss": 0.78816789, "num_input_tokens_seen": 132941060, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.52734375, "step": 6184, "time_per_iteration": 3.7743980884552 }, { "auxiliary_loss_clip": 0.0107153, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.01381278, "balance_loss_mlp": 1.0239737, "epoch": 0.37186231775139034, "flos": 21868860643200.0, "grad_norm": 1.880182044908937, "language_loss": 0.72111183, "learning_rate": 2.7839153946179943e-06, "loss": 0.7421149, "num_input_tokens_seen": 132961850, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.4765625, "step": 6185, "time_per_iteration": 2.4437339305877686 }, { "auxiliary_loss_clip": 0.01072195, "auxiliary_loss_mlp": 0.01020814, "balance_loss_clip": 1.00696802, "balance_loss_mlp": 1.02383614, "epoch": 0.3719224410040583, "flos": 22414238991360.0, "grad_norm": 1.716124519664065, "language_loss": 0.77161324, "learning_rate": 2.783567790422718e-06, "loss": 0.79254329, "num_input_tokens_seen": 132981625, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.48242188, "step": 6186, "time_per_iteration": 2.3991236686706543 }, { "auxiliary_loss_clip": 0.01076559, "auxiliary_loss_mlp": 0.01032691, "balance_loss_clip": 1.016765, "balance_loss_mlp": 1.02423143, "epoch": 0.37198256425672627, "flos": 25150172774400.0, "grad_norm": 1.6683382242907194, "language_loss": 0.83254975, "learning_rate": 2.7832201582639227e-06, "loss": 0.85364223, "num_input_tokens_seen": 133001225, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5234375, "step": 6187, "time_per_iteration": 3.8323800563812256 }, { "auxiliary_loss_clip": 0.01072231, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.02053714, "balance_loss_mlp": 1.02339578, "epoch": 0.37204268750939423, "flos": 21137360503680.0, "grad_norm": 2.4403374394653894, "language_loss": 0.84715891, "learning_rate": 2.782872498154015e-06, "loss": 0.8682276, "num_input_tokens_seen": 133018820, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.48828125, "step": 6188, "time_per_iteration": 2.4364564418792725 }, { "auxiliary_loss_clip": 0.01073667, "auxiliary_loss_mlp": 0.01029507, "balance_loss_clip": 1.01457644, "balance_loss_mlp": 1.02422071, "epoch": 0.37210281076206225, "flos": 21505763836800.0, "grad_norm": 1.6357768777067616, "language_loss": 0.65399086, "learning_rate": 2.782524810105401e-06, "loss": 0.6750226, "num_input_tokens_seen": 133040205, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.49414062, "step": 6189, "time_per_iteration": 2.435692548751831 }, { "auxiliary_loss_clip": 0.01076552, "auxiliary_loss_mlp": 0.01033102, "balance_loss_clip": 1.01834989, "balance_loss_mlp": 1.0265429, "epoch": 0.3721629340147302, "flos": 17346874970880.0, "grad_norm": 1.7951459138673946, "language_loss": 0.83917522, "learning_rate": 2.78217709413049e-06, "loss": 0.86027175, "num_input_tokens_seen": 133058095, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.5, "step": 6190, "time_per_iteration": 2.3571572303771973 }, { "auxiliary_loss_clip": 0.01074819, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.01549053, "balance_loss_mlp": 1.02322471, "epoch": 0.3722230572673982, "flos": 16431557189760.0, "grad_norm": 2.362273291297379, "language_loss": 0.87694013, "learning_rate": 2.781829350241691e-06, "loss": 0.89799178, "num_input_tokens_seen": 133071530, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.515625, "step": 6191, "time_per_iteration": 2.3645830154418945 }, { "auxiliary_loss_clip": 0.01073861, "auxiliary_loss_mlp": 0.01031657, "balance_loss_clip": 1.01458609, "balance_loss_mlp": 1.02215326, "epoch": 0.37228318052006615, "flos": 22673608548480.0, "grad_norm": 1.7807218420149122, "language_loss": 0.73598015, "learning_rate": 2.7814815784514125e-06, "loss": 0.75703537, "num_input_tokens_seen": 133091410, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.515625, "step": 6192, "time_per_iteration": 2.40130352973938 }, { "auxiliary_loss_clip": 0.01070789, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.02030671, "balance_loss_mlp": 1.02252173, "epoch": 0.3723433037727341, "flos": 25264303608960.0, "grad_norm": 2.00977431482234, "language_loss": 0.79427123, "learning_rate": 2.7811337787720674e-06, "loss": 0.81532073, "num_input_tokens_seen": 133110365, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.48242188, "step": 6193, "time_per_iteration": 2.4418416023254395 }, { "auxiliary_loss_clip": 0.01073343, "auxiliary_loss_mlp": 0.01034885, "balance_loss_clip": 1.02055621, "balance_loss_mlp": 1.02290344, "epoch": 0.3724034270254021, "flos": 10523903633280.0, "grad_norm": 1.7143225133463045, "language_loss": 0.84252727, "learning_rate": 2.7807859512160663e-06, "loss": 0.86360955, "num_input_tokens_seen": 133128255, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.50390625, "step": 6194, "time_per_iteration": 2.358292818069458 }, { "auxiliary_loss_clip": 0.0107146, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.02523792, "balance_loss_mlp": 1.02197599, "epoch": 0.37246355027807004, "flos": 20265195029760.0, "grad_norm": 2.2598525844751607, "language_loss": 0.77382863, "learning_rate": 2.7804380957958238e-06, "loss": 0.79493612, "num_input_tokens_seen": 133143975, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49414062, "step": 6195, "time_per_iteration": 2.4039194583892822 }, { "auxiliary_loss_clip": 0.010717, "auxiliary_loss_mlp": 0.01033611, "balance_loss_clip": 1.01815605, "balance_loss_mlp": 1.02313817, "epoch": 0.372523673530738, "flos": 19499549713920.0, "grad_norm": 1.4917777586955658, "language_loss": 0.78952169, "learning_rate": 2.780090212523753e-06, "loss": 0.81057477, "num_input_tokens_seen": 133162935, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.48632812, "step": 6196, "time_per_iteration": 2.3927886486053467 }, { "auxiliary_loss_clip": 0.01071872, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.01787806, "balance_loss_mlp": 1.02293777, "epoch": 0.372583796783406, "flos": 16763301728640.0, "grad_norm": 1.9642784373385083, "language_loss": 0.82945043, "learning_rate": 2.779742301412269e-06, "loss": 0.85048997, "num_input_tokens_seen": 133181180, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48828125, "step": 6197, "time_per_iteration": 2.562487840652466 }, { "auxiliary_loss_clip": 0.01070065, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.01587713, "balance_loss_mlp": 1.0225631, "epoch": 0.37264392003607394, "flos": 22636879931520.0, "grad_norm": 1.5517358097417642, "language_loss": 0.64610058, "learning_rate": 2.7793943624737884e-06, "loss": 0.6671105, "num_input_tokens_seen": 133199615, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.47460938, "step": 6198, "time_per_iteration": 2.4047329425811768 }, { "auxiliary_loss_clip": 0.0106942, "auxiliary_loss_mlp": 0.0103916, "balance_loss_clip": 1.02477741, "balance_loss_mlp": 1.02189946, "epoch": 0.3727040432887419, "flos": 19972134144000.0, "grad_norm": 1.5870133253278214, "language_loss": 0.73951387, "learning_rate": 2.7790463957207275e-06, "loss": 0.76059961, "num_input_tokens_seen": 133219650, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.4765625, "step": 6199, "time_per_iteration": 2.5048775672912598 }, { "auxiliary_loss_clip": 0.01069121, "auxiliary_loss_mlp": 0.01024381, "balance_loss_clip": 1.01048076, "balance_loss_mlp": 1.02145684, "epoch": 0.37276416654140987, "flos": 63896991517440.0, "grad_norm": 1.7664831402604675, "language_loss": 0.80773234, "learning_rate": 2.7786984011655045e-06, "loss": 0.82866734, "num_input_tokens_seen": 133245675, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.4765625, "step": 6200, "time_per_iteration": 2.8172550201416016 }, { "auxiliary_loss_clip": 0.01072431, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.01771438, "balance_loss_mlp": 1.02333105, "epoch": 0.37282428979407783, "flos": 39784401705600.0, "grad_norm": 1.9718792875855191, "language_loss": 0.60570848, "learning_rate": 2.7783503788205383e-06, "loss": 0.62675607, "num_input_tokens_seen": 133266905, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49023438, "step": 6201, "time_per_iteration": 2.592897415161133 }, { "auxiliary_loss_clip": 0.01074313, "auxiliary_loss_mlp": 0.01031279, "balance_loss_clip": 1.01644909, "balance_loss_mlp": 1.02458799, "epoch": 0.37288441304674586, "flos": 22707998104320.0, "grad_norm": 1.7946640034925792, "language_loss": 0.73126602, "learning_rate": 2.7780023286982502e-06, "loss": 0.75232196, "num_input_tokens_seen": 133286865, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49609375, "step": 6202, "time_per_iteration": 2.426520824432373 }, { "auxiliary_loss_clip": 0.01071296, "auxiliary_loss_mlp": 0.01032465, "balance_loss_clip": 1.01842213, "balance_loss_mlp": 1.02342236, "epoch": 0.3729445362994138, "flos": 18769306383360.0, "grad_norm": 1.905286330760698, "language_loss": 0.73945296, "learning_rate": 2.77765425081106e-06, "loss": 0.76049066, "num_input_tokens_seen": 133305295, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.47851562, "step": 6203, "time_per_iteration": 2.429581642150879 }, { "auxiliary_loss_clip": 0.01069211, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.01419401, "balance_loss_mlp": 1.0224328, "epoch": 0.3730046595520818, "flos": 22455087148800.0, "grad_norm": 1.6515603762382831, "language_loss": 0.8148706, "learning_rate": 2.7773061451713893e-06, "loss": 0.83583176, "num_input_tokens_seen": 133324625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.46679688, "step": 6204, "time_per_iteration": 2.428969144821167 }, { "auxiliary_loss_clip": 0.01073931, "auxiliary_loss_mlp": 0.01035608, "balance_loss_clip": 1.0206176, "balance_loss_mlp": 1.02339745, "epoch": 0.37306478280474975, "flos": 24315224676480.0, "grad_norm": 1.952317168419333, "language_loss": 0.75136554, "learning_rate": 2.776958011791662e-06, "loss": 0.772461, "num_input_tokens_seen": 133344625, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 6205, "time_per_iteration": 2.4917941093444824 }, { "auxiliary_loss_clip": 0.01070959, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.02197933, "balance_loss_mlp": 1.02249265, "epoch": 0.3731249060574177, "flos": 15814257707520.0, "grad_norm": 2.0624817279397627, "language_loss": 0.7796436, "learning_rate": 2.776609850684302e-06, "loss": 0.80072081, "num_input_tokens_seen": 133363605, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.484375, "step": 6206, "time_per_iteration": 2.3968253135681152 }, { "auxiliary_loss_clip": 0.01071396, "auxiliary_loss_mlp": 0.01034369, "balance_loss_clip": 1.01899123, "balance_loss_mlp": 1.02178884, "epoch": 0.3731850293100857, "flos": 19827069978240.0, "grad_norm": 2.0639849207425702, "language_loss": 0.93374777, "learning_rate": 2.7762616618617346e-06, "loss": 0.95480537, "num_input_tokens_seen": 133379405, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.49609375, "step": 6207, "time_per_iteration": 2.437605381011963 }, { "auxiliary_loss_clip": 0.01073339, "auxiliary_loss_mlp": 0.01028216, "balance_loss_clip": 1.01380968, "balance_loss_mlp": 1.02353597, "epoch": 0.37324515256275365, "flos": 19061354839680.0, "grad_norm": 2.021518420268413, "language_loss": 0.82872462, "learning_rate": 2.7759134453363847e-06, "loss": 0.84974027, "num_input_tokens_seen": 133397585, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49804688, "step": 6208, "time_per_iteration": 2.428740978240967 }, { "auxiliary_loss_clip": 0.0107438, "auxiliary_loss_mlp": 0.01033187, "balance_loss_clip": 1.01751077, "balance_loss_mlp": 1.02334714, "epoch": 0.3733052758154216, "flos": 20703285169920.0, "grad_norm": 1.9384499984513741, "language_loss": 0.73320979, "learning_rate": 2.7755652011206798e-06, "loss": 0.75428545, "num_input_tokens_seen": 133415365, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5078125, "step": 6209, "time_per_iteration": 2.4449281692504883 }, { "auxiliary_loss_clip": 0.01072479, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.01419723, "balance_loss_mlp": 1.0235002, "epoch": 0.3733653990680896, "flos": 20192470934400.0, "grad_norm": 2.684803602523182, "language_loss": 0.70224679, "learning_rate": 2.7752169292270485e-06, "loss": 0.72326779, "num_input_tokens_seen": 133435700, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.48828125, "step": 6210, "time_per_iteration": 2.4383599758148193 }, { "auxiliary_loss_clip": 0.01073842, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.01439178, "balance_loss_mlp": 1.02291262, "epoch": 0.37342552232075754, "flos": 20338617352320.0, "grad_norm": 1.5257492057054218, "language_loss": 0.77836812, "learning_rate": 2.7748686296679184e-06, "loss": 0.79940307, "num_input_tokens_seen": 133455180, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 6211, "time_per_iteration": 2.4451894760131836 }, { "auxiliary_loss_clip": 0.01073375, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.02191854, "balance_loss_mlp": 1.0232141, "epoch": 0.3734856455734255, "flos": 35516409240960.0, "grad_norm": 1.6208511725525019, "language_loss": 0.73387438, "learning_rate": 2.7745203024557207e-06, "loss": 0.75497729, "num_input_tokens_seen": 133476715, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5, "step": 6212, "time_per_iteration": 2.5342884063720703 }, { "auxiliary_loss_clip": 0.0107942, "auxiliary_loss_mlp": 0.01045293, "balance_loss_clip": 1.02907455, "balance_loss_mlp": 1.02517772, "epoch": 0.37354576882609347, "flos": 21141235664640.0, "grad_norm": 2.6233294817691717, "language_loss": 0.821738, "learning_rate": 2.7741719476028855e-06, "loss": 0.84298515, "num_input_tokens_seen": 133494550, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.54296875, "step": 6213, "time_per_iteration": 2.439648151397705 }, { "auxiliary_loss_clip": 0.01075524, "auxiliary_loss_mlp": 0.01036, "balance_loss_clip": 1.02059782, "balance_loss_mlp": 1.02497196, "epoch": 0.37360589207876144, "flos": 21505728925440.0, "grad_norm": 2.459126521009793, "language_loss": 0.78650653, "learning_rate": 2.773823565121844e-06, "loss": 0.80762172, "num_input_tokens_seen": 133512640, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5078125, "step": 6214, "time_per_iteration": 2.4146270751953125 }, { "auxiliary_loss_clip": 0.010713, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.02341223, "balance_loss_mlp": 1.02261221, "epoch": 0.37366601533142946, "flos": 38434275452160.0, "grad_norm": 1.6021105794750954, "language_loss": 0.84825546, "learning_rate": 2.7734751550250306e-06, "loss": 0.86934805, "num_input_tokens_seen": 133535540, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.48632812, "step": 6215, "time_per_iteration": 2.5981287956237793 }, { "auxiliary_loss_clip": 0.01074024, "auxiliary_loss_mlp": 0.01036817, "balance_loss_clip": 1.0203898, "balance_loss_mlp": 1.02218866, "epoch": 0.3737261385840974, "flos": 18440215107840.0, "grad_norm": 1.6171487134210691, "language_loss": 0.68683094, "learning_rate": 2.773126717324879e-06, "loss": 0.70793933, "num_input_tokens_seen": 133555795, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.515625, "step": 6216, "time_per_iteration": 2.409013509750366 }, { "auxiliary_loss_clip": 0.01074612, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.01682472, "balance_loss_mlp": 1.02353728, "epoch": 0.3737862618367654, "flos": 22928753831040.0, "grad_norm": 2.794683952874038, "language_loss": 0.65873545, "learning_rate": 2.7727782520338227e-06, "loss": 0.67980003, "num_input_tokens_seen": 133575905, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 6217, "time_per_iteration": 2.4490344524383545 }, { "auxiliary_loss_clip": 0.01074285, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.01288247, "balance_loss_mlp": 1.02340353, "epoch": 0.37384638508943335, "flos": 15408881377920.0, "grad_norm": 1.8113159017269491, "language_loss": 0.80279231, "learning_rate": 2.772429759164299e-06, "loss": 0.82381666, "num_input_tokens_seen": 133592585, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 6218, "time_per_iteration": 3.7649877071380615 }, { "auxiliary_loss_clip": 0.01069091, "auxiliary_loss_mlp": 0.01029567, "balance_loss_clip": 1.01571488, "balance_loss_mlp": 1.02250803, "epoch": 0.3739065083421013, "flos": 24279648134400.0, "grad_norm": 1.8279393161731508, "language_loss": 0.7868017, "learning_rate": 2.7720812387287444e-06, "loss": 0.80778825, "num_input_tokens_seen": 133615070, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.46484375, "step": 6219, "time_per_iteration": 2.487558126449585 }, { "auxiliary_loss_clip": 0.0107171, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.01756525, "balance_loss_mlp": 1.02380347, "epoch": 0.3739666315947693, "flos": 23001722305920.0, "grad_norm": 2.4528825866951887, "language_loss": 0.76458478, "learning_rate": 2.771732690739596e-06, "loss": 0.78563643, "num_input_tokens_seen": 133633490, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.47851562, "step": 6220, "time_per_iteration": 2.4168243408203125 }, { "auxiliary_loss_clip": 0.01073589, "auxiliary_loss_mlp": 0.01033883, "balance_loss_clip": 1.01837349, "balance_loss_mlp": 1.02267933, "epoch": 0.37402675484743725, "flos": 19390097001600.0, "grad_norm": 1.5716476723891935, "language_loss": 0.82563639, "learning_rate": 2.771384115209293e-06, "loss": 0.8467111, "num_input_tokens_seen": 133653425, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5078125, "step": 6221, "time_per_iteration": 2.4402990341186523 }, { "auxiliary_loss_clip": 0.01073214, "auxiliary_loss_mlp": 0.01036029, "balance_loss_clip": 1.02105641, "balance_loss_mlp": 1.0236305, "epoch": 0.3740868781001052, "flos": 17125281371520.0, "grad_norm": 1.7713849061384412, "language_loss": 0.76439321, "learning_rate": 2.771035512150275e-06, "loss": 0.78548568, "num_input_tokens_seen": 133670220, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.49609375, "step": 6222, "time_per_iteration": 3.8430683612823486 }, { "auxiliary_loss_clip": 0.0107383, "auxiliary_loss_mlp": 0.01030599, "balance_loss_clip": 1.01489902, "balance_loss_mlp": 1.02368045, "epoch": 0.3741470013527732, "flos": 20042589000960.0, "grad_norm": 1.6175378588570886, "language_loss": 0.70520711, "learning_rate": 2.770686881574983e-06, "loss": 0.72625136, "num_input_tokens_seen": 133688910, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5, "step": 6223, "time_per_iteration": 3.9344568252563477 }, { "auxiliary_loss_clip": 0.01073837, "auxiliary_loss_mlp": 0.01030566, "balance_loss_clip": 1.01515806, "balance_loss_mlp": 1.02440631, "epoch": 0.37420712460544114, "flos": 36895967637120.0, "grad_norm": 1.8608568225668625, "language_loss": 0.68456465, "learning_rate": 2.770338223495859e-06, "loss": 0.70560867, "num_input_tokens_seen": 133708690, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.49414062, "step": 6224, "time_per_iteration": 2.5350565910339355 }, { "auxiliary_loss_clip": 0.0107091, "auxiliary_loss_mlp": 0.01031762, "balance_loss_clip": 1.01719451, "balance_loss_mlp": 1.02378225, "epoch": 0.3742672478581091, "flos": 22200081511680.0, "grad_norm": 1.75345015961603, "language_loss": 0.70153582, "learning_rate": 2.7699895379253447e-06, "loss": 0.72256255, "num_input_tokens_seen": 133728095, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.47070312, "step": 6225, "time_per_iteration": 2.4380321502685547 }, { "auxiliary_loss_clip": 0.01071065, "auxiliary_loss_mlp": 0.01029559, "balance_loss_clip": 1.01430035, "balance_loss_mlp": 1.02285171, "epoch": 0.3743273711107771, "flos": 24680381253120.0, "grad_norm": 2.474053064681131, "language_loss": 0.78778261, "learning_rate": 2.7696408248758846e-06, "loss": 0.80878878, "num_input_tokens_seen": 133745590, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.48242188, "step": 6226, "time_per_iteration": 2.4330577850341797 }, { "auxiliary_loss_clip": 0.01072624, "auxiliary_loss_mlp": 0.01031714, "balance_loss_clip": 1.01613379, "balance_loss_mlp": 1.02253902, "epoch": 0.37438749436344504, "flos": 24458543274240.0, "grad_norm": 1.8687216456812434, "language_loss": 0.68062204, "learning_rate": 2.7692920843599238e-06, "loss": 0.7016654, "num_input_tokens_seen": 133766155, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.5, "step": 6227, "time_per_iteration": 3.9569149017333984 }, { "auxiliary_loss_clip": 0.01072357, "auxiliary_loss_mlp": 0.01026676, "balance_loss_clip": 1.01247859, "balance_loss_mlp": 1.02367711, "epoch": 0.374447617616113, "flos": 21797672647680.0, "grad_norm": 1.5764803169889852, "language_loss": 0.82882261, "learning_rate": 2.7689433163899073e-06, "loss": 0.84981292, "num_input_tokens_seen": 133783185, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.48632812, "step": 6228, "time_per_iteration": 2.3952555656433105 }, { "auxiliary_loss_clip": 0.01071655, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.02030945, "balance_loss_mlp": 1.02389169, "epoch": 0.374507740868781, "flos": 17967211741440.0, "grad_norm": 1.4758673450195732, "language_loss": 0.74699509, "learning_rate": 2.7685945209782816e-06, "loss": 0.76806861, "num_input_tokens_seen": 133800975, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.4765625, "step": 6229, "time_per_iteration": 2.408698558807373 }, { "auxiliary_loss_clip": 0.0107123, "auxiliary_loss_mlp": 0.01029753, "balance_loss_clip": 1.01397026, "balance_loss_mlp": 1.02218044, "epoch": 0.374567864121449, "flos": 16104944620800.0, "grad_norm": 1.8773641338396432, "language_loss": 0.8345238, "learning_rate": 2.7682456981374946e-06, "loss": 0.85553372, "num_input_tokens_seen": 133818020, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.48828125, "step": 6230, "time_per_iteration": 2.366354465484619 }, { "auxiliary_loss_clip": 0.01073039, "auxiliary_loss_mlp": 0.01035277, "balance_loss_clip": 1.01930881, "balance_loss_mlp": 1.02375996, "epoch": 0.37462798737411696, "flos": 25772045644800.0, "grad_norm": 2.4945950179972467, "language_loss": 0.72984105, "learning_rate": 2.7678968478799943e-06, "loss": 0.75092417, "num_input_tokens_seen": 133840690, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.4921875, "step": 6231, "time_per_iteration": 2.485710620880127 }, { "auxiliary_loss_clip": 0.01075258, "auxiliary_loss_mlp": 0.01031794, "balance_loss_clip": 1.01655364, "balance_loss_mlp": 1.02421534, "epoch": 0.3746881106267849, "flos": 16653569725440.0, "grad_norm": 4.6471805573035505, "language_loss": 0.73937887, "learning_rate": 2.767547970218231e-06, "loss": 0.76044941, "num_input_tokens_seen": 133858350, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.51171875, "step": 6232, "time_per_iteration": 2.3835694789886475 }, { "auxiliary_loss_clip": 0.0107136, "auxiliary_loss_mlp": 0.01027617, "balance_loss_clip": 1.01240623, "balance_loss_mlp": 1.02195716, "epoch": 0.3747482338794529, "flos": 26176758658560.0, "grad_norm": 1.6102416345045005, "language_loss": 0.77547932, "learning_rate": 2.767199065164655e-06, "loss": 0.79646909, "num_input_tokens_seen": 133879775, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49414062, "step": 6233, "time_per_iteration": 2.4585635662078857 }, { "auxiliary_loss_clip": 0.01072705, "auxiliary_loss_mlp": 0.01032043, "balance_loss_clip": 1.01750541, "balance_loss_mlp": 1.02270055, "epoch": 0.37480835713212085, "flos": 12020246127360.0, "grad_norm": 1.7627136175414742, "language_loss": 0.69532347, "learning_rate": 2.7668501327317184e-06, "loss": 0.71637094, "num_input_tokens_seen": 133898295, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5, "step": 6234, "time_per_iteration": 2.3981785774230957 }, { "auxiliary_loss_clip": 0.01071549, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.01517558, "balance_loss_mlp": 1.02282214, "epoch": 0.3748684803847888, "flos": 19678340119680.0, "grad_norm": 1.9495729458196984, "language_loss": 0.82885414, "learning_rate": 2.7665011729318727e-06, "loss": 0.84985918, "num_input_tokens_seen": 133915230, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48828125, "step": 6235, "time_per_iteration": 2.409662961959839 }, { "auxiliary_loss_clip": 0.01075161, "auxiliary_loss_mlp": 0.01033312, "balance_loss_clip": 1.01858401, "balance_loss_mlp": 1.02454495, "epoch": 0.3749286036374568, "flos": 20520165755520.0, "grad_norm": 1.9546016377829813, "language_loss": 0.78228092, "learning_rate": 2.7661521857775715e-06, "loss": 0.80336571, "num_input_tokens_seen": 133934110, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5078125, "step": 6236, "time_per_iteration": 2.4079718589782715 }, { "auxiliary_loss_clip": 0.01074648, "auxiliary_loss_mlp": 0.01035374, "balance_loss_clip": 1.01854134, "balance_loss_mlp": 1.0232482, "epoch": 0.37498872689012475, "flos": 20703564460800.0, "grad_norm": 3.050010797417045, "language_loss": 0.73828101, "learning_rate": 2.76580317128127e-06, "loss": 0.75938118, "num_input_tokens_seen": 133952395, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.515625, "step": 6237, "time_per_iteration": 2.4097652435302734 }, { "auxiliary_loss_clip": 0.01073537, "auxiliary_loss_mlp": 0.01028589, "balance_loss_clip": 1.0125674, "balance_loss_mlp": 1.02277398, "epoch": 0.3750488501427927, "flos": 21573914544000.0, "grad_norm": 2.0114703892423718, "language_loss": 0.93086565, "learning_rate": 2.765454129455423e-06, "loss": 0.95188695, "num_input_tokens_seen": 133969635, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5078125, "step": 6238, "time_per_iteration": 2.401618480682373 }, { "auxiliary_loss_clip": 0.01072132, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.0143919, "balance_loss_mlp": 1.02237415, "epoch": 0.3751089733954607, "flos": 15922977281280.0, "grad_norm": 2.053237020034645, "language_loss": 0.71100038, "learning_rate": 2.765105060312487e-06, "loss": 0.73201525, "num_input_tokens_seen": 133987215, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.49804688, "step": 6239, "time_per_iteration": 2.4259181022644043 }, { "auxiliary_loss_clip": 0.01077219, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.01528382, "balance_loss_mlp": 1.025388, "epoch": 0.37516909664812864, "flos": 36283136808960.0, "grad_norm": 1.5102689779306429, "language_loss": 0.65338063, "learning_rate": 2.76475596386492e-06, "loss": 0.674456, "num_input_tokens_seen": 134009250, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.515625, "step": 6240, "time_per_iteration": 2.530618667602539 }, { "auxiliary_loss_clip": 0.01073841, "auxiliary_loss_mlp": 0.01028872, "balance_loss_clip": 1.01401806, "balance_loss_mlp": 1.02319646, "epoch": 0.3752292199007966, "flos": 13515087432960.0, "grad_norm": 1.6989651921009004, "language_loss": 0.75595129, "learning_rate": 2.764406840125179e-06, "loss": 0.77697843, "num_input_tokens_seen": 134026875, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.50390625, "step": 6241, "time_per_iteration": 2.4018259048461914 }, { "auxiliary_loss_clip": 0.01074721, "auxiliary_loss_mlp": 0.01038657, "balance_loss_clip": 1.0213958, "balance_loss_mlp": 1.02353191, "epoch": 0.3752893431534646, "flos": 27196885941120.0, "grad_norm": 2.089112001114548, "language_loss": 0.84214926, "learning_rate": 2.7640576891057246e-06, "loss": 0.86328304, "num_input_tokens_seen": 134047185, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.51171875, "step": 6242, "time_per_iteration": 2.448578357696533 }, { "auxiliary_loss_clip": 0.01074038, "auxiliary_loss_mlp": 0.01035701, "balance_loss_clip": 1.02131283, "balance_loss_mlp": 1.02355623, "epoch": 0.3753494664061326, "flos": 30006381692160.0, "grad_norm": 1.7973205343506828, "language_loss": 0.68096197, "learning_rate": 2.763708510819017e-06, "loss": 0.70205939, "num_input_tokens_seen": 134067330, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.50390625, "step": 6243, "time_per_iteration": 2.481720209121704 }, { "auxiliary_loss_clip": 0.01073019, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.01986682, "balance_loss_mlp": 1.02390313, "epoch": 0.37540958965880056, "flos": 24460812512640.0, "grad_norm": 1.995312742969244, "language_loss": 0.83813632, "learning_rate": 2.7633593052775174e-06, "loss": 0.85922945, "num_input_tokens_seen": 134085525, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.4921875, "step": 6244, "time_per_iteration": 2.432147741317749 }, { "auxiliary_loss_clip": 0.01070223, "auxiliary_loss_mlp": 0.01028842, "balance_loss_clip": 1.0146023, "balance_loss_mlp": 1.02315938, "epoch": 0.3754697129114685, "flos": 16507458218880.0, "grad_norm": 2.526467172015851, "language_loss": 0.83259434, "learning_rate": 2.763010072493687e-06, "loss": 0.853585, "num_input_tokens_seen": 134101855, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.46875, "step": 6245, "time_per_iteration": 2.388734817504883 }, { "auxiliary_loss_clip": 0.01072486, "auxiliary_loss_mlp": 0.01035757, "balance_loss_clip": 1.02055764, "balance_loss_mlp": 1.02326655, "epoch": 0.3755298361641365, "flos": 19389887533440.0, "grad_norm": 4.46910041998433, "language_loss": 0.63639015, "learning_rate": 2.76266081247999e-06, "loss": 0.65747261, "num_input_tokens_seen": 134119360, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.4921875, "step": 6246, "time_per_iteration": 2.402914047241211 }, { "auxiliary_loss_clip": 0.01074434, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.01923394, "balance_loss_mlp": 1.02348852, "epoch": 0.37558995941680445, "flos": 14719521116160.0, "grad_norm": 1.6823006376077325, "language_loss": 0.74856913, "learning_rate": 2.7623115252488905e-06, "loss": 0.76967275, "num_input_tokens_seen": 134137475, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.51171875, "step": 6247, "time_per_iteration": 2.3875598907470703 }, { "auxiliary_loss_clip": 0.01074235, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.01266456, "balance_loss_mlp": 1.02294374, "epoch": 0.3756500826694724, "flos": 21688813428480.0, "grad_norm": 13.28785055875454, "language_loss": 0.54971582, "learning_rate": 2.7619622108128534e-06, "loss": 0.57073557, "num_input_tokens_seen": 134154580, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.51171875, "step": 6248, "time_per_iteration": 2.4299628734588623 }, { "auxiliary_loss_clip": 0.01072113, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.01741385, "balance_loss_mlp": 1.02287054, "epoch": 0.3757102059221404, "flos": 26504453479680.0, "grad_norm": 2.633121587450112, "language_loss": 0.84367877, "learning_rate": 2.7616128691843452e-06, "loss": 0.86472297, "num_input_tokens_seen": 134174285, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.4921875, "step": 6249, "time_per_iteration": 2.438007354736328 }, { "auxiliary_loss_clip": 0.01071086, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.01693547, "balance_loss_mlp": 1.02238822, "epoch": 0.37577032917480835, "flos": 37336676129280.0, "grad_norm": 1.6156659381612262, "language_loss": 0.67711782, "learning_rate": 2.761263500375832e-06, "loss": 0.69814253, "num_input_tokens_seen": 134195940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48632812, "step": 6250, "time_per_iteration": 2.5422446727752686 }, { "auxiliary_loss_clip": 0.01075228, "auxiliary_loss_mlp": 0.01035926, "balance_loss_clip": 1.02216983, "balance_loss_mlp": 1.02463162, "epoch": 0.3758304524274763, "flos": 21907509384960.0, "grad_norm": 2.4290308739049267, "language_loss": 0.77889025, "learning_rate": 2.760914104399784e-06, "loss": 0.80000186, "num_input_tokens_seen": 134212235, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.50390625, "step": 6251, "time_per_iteration": 2.395646095275879 }, { "auxiliary_loss_clip": 0.01073676, "auxiliary_loss_mlp": 0.01036992, "balance_loss_clip": 1.02190053, "balance_loss_mlp": 1.02341557, "epoch": 0.3758905756801443, "flos": 36568028436480.0, "grad_norm": 1.835856414744348, "language_loss": 0.5781337, "learning_rate": 2.7605646812686687e-06, "loss": 0.59924042, "num_input_tokens_seen": 134233810, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.50390625, "step": 6252, "time_per_iteration": 2.5421323776245117 }, { "auxiliary_loss_clip": 0.01074985, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.01913857, "balance_loss_mlp": 1.02336311, "epoch": 0.37595069893281224, "flos": 24527811144960.0, "grad_norm": 1.7371595917188212, "language_loss": 0.89704341, "learning_rate": 2.7602152309949552e-06, "loss": 0.91813552, "num_input_tokens_seen": 134252020, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.515625, "step": 6253, "time_per_iteration": 2.42207407951355 }, { "auxiliary_loss_clip": 0.010721, "auxiliary_loss_mlp": 0.01034114, "balance_loss_clip": 1.01974916, "balance_loss_mlp": 1.02340984, "epoch": 0.3760108221854802, "flos": 16434105719040.0, "grad_norm": 1.754969666250421, "language_loss": 0.76611519, "learning_rate": 2.7598657535911166e-06, "loss": 0.78717726, "num_input_tokens_seen": 134269495, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.48632812, "step": 6254, "time_per_iteration": 2.3783488273620605 }, { "auxiliary_loss_clip": 0.01076465, "auxiliary_loss_mlp": 0.01037614, "balance_loss_clip": 1.02150869, "balance_loss_mlp": 1.02485919, "epoch": 0.37607094543814823, "flos": 13770896031360.0, "grad_norm": 2.2643559385653274, "language_loss": 0.61856663, "learning_rate": 2.759516249069623e-06, "loss": 0.63970739, "num_input_tokens_seen": 134287035, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.515625, "step": 6255, "time_per_iteration": 2.3731582164764404 }, { "auxiliary_loss_clip": 0.01075863, "auxiliary_loss_mlp": 0.01033719, "balance_loss_clip": 1.01719642, "balance_loss_mlp": 1.02263391, "epoch": 0.3761310686908162, "flos": 19857095614080.0, "grad_norm": 2.859601823354851, "language_loss": 0.73764277, "learning_rate": 2.7591667174429487e-06, "loss": 0.75873852, "num_input_tokens_seen": 134304840, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.53125, "step": 6256, "time_per_iteration": 2.405026912689209 }, { "auxiliary_loss_clip": 0.01077443, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.02024484, "balance_loss_mlp": 1.02585077, "epoch": 0.37619119194348416, "flos": 12749965787520.0, "grad_norm": 1.8261874370215634, "language_loss": 0.70658994, "learning_rate": 2.758817158723568e-06, "loss": 0.72772372, "num_input_tokens_seen": 134323180, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.515625, "step": 6257, "time_per_iteration": 3.853148937225342 }, { "auxiliary_loss_clip": 0.01072975, "auxiliary_loss_mlp": 0.01028035, "balance_loss_clip": 1.01311588, "balance_loss_mlp": 1.02380705, "epoch": 0.3762513151961521, "flos": 17529575448960.0, "grad_norm": 2.414141865492626, "language_loss": 0.84515548, "learning_rate": 2.7584675729239537e-06, "loss": 0.86616558, "num_input_tokens_seen": 134341390, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.4921875, "step": 6258, "time_per_iteration": 2.411872148513794 }, { "auxiliary_loss_clip": 0.01071784, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.01638472, "balance_loss_mlp": 1.02268147, "epoch": 0.3763114384488201, "flos": 23616438347520.0, "grad_norm": 1.4496009404322672, "language_loss": 0.80593866, "learning_rate": 2.7581179600565833e-06, "loss": 0.82695937, "num_input_tokens_seen": 134360425, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.4921875, "step": 6259, "time_per_iteration": 2.4118783473968506 }, { "auxiliary_loss_clip": 0.0107643, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.01738453, "balance_loss_mlp": 1.02409172, "epoch": 0.37637156170148806, "flos": 25405911550080.0, "grad_norm": 2.117599986329863, "language_loss": 0.70945382, "learning_rate": 2.7577683201339324e-06, "loss": 0.73055607, "num_input_tokens_seen": 134379775, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5234375, "step": 6260, "time_per_iteration": 2.4490718841552734 }, { "auxiliary_loss_clip": 0.01072644, "auxiliary_loss_mlp": 0.01028651, "balance_loss_clip": 1.01315427, "balance_loss_mlp": 1.02171111, "epoch": 0.376431684954156, "flos": 23439777534720.0, "grad_norm": 1.690756104317432, "language_loss": 0.78478593, "learning_rate": 2.75741865316848e-06, "loss": 0.80579889, "num_input_tokens_seen": 134400315, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5078125, "step": 6261, "time_per_iteration": 3.8701090812683105 }, { "auxiliary_loss_clip": 0.01078287, "auxiliary_loss_mlp": 0.01031069, "balance_loss_clip": 1.01501119, "balance_loss_mlp": 1.0252254, "epoch": 0.376491808206824, "flos": 34203046515840.0, "grad_norm": 1.7092523183873836, "language_loss": 0.80223846, "learning_rate": 2.757068959172704e-06, "loss": 0.82333195, "num_input_tokens_seen": 134422875, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.53125, "step": 6262, "time_per_iteration": 2.504891872406006 }, { "auxiliary_loss_clip": 0.01070617, "auxiliary_loss_mlp": 0.01029798, "balance_loss_clip": 1.01481318, "balance_loss_mlp": 1.02237606, "epoch": 0.37655193145949195, "flos": 35184315588480.0, "grad_norm": 2.4167735123511265, "language_loss": 0.79974389, "learning_rate": 2.7567192381590837e-06, "loss": 0.82074803, "num_input_tokens_seen": 134443025, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48242188, "step": 6263, "time_per_iteration": 3.9764513969421387 }, { "auxiliary_loss_clip": 0.01073939, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.0184176, "balance_loss_mlp": 1.02370751, "epoch": 0.3766120547121599, "flos": 16760962667520.0, "grad_norm": 2.864044229868821, "language_loss": 0.79476255, "learning_rate": 2.756369490140101e-06, "loss": 0.81583941, "num_input_tokens_seen": 134460945, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.50390625, "step": 6264, "time_per_iteration": 2.3893048763275146 }, { "auxiliary_loss_clip": 0.01070372, "auxiliary_loss_mlp": 0.01032755, "balance_loss_clip": 1.01699543, "balance_loss_mlp": 1.02184439, "epoch": 0.3766721779648279, "flos": 23549230247040.0, "grad_norm": 1.7936473927642498, "language_loss": 0.73654813, "learning_rate": 2.756019715128236e-06, "loss": 0.75757939, "num_input_tokens_seen": 134480440, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.484375, "step": 6265, "time_per_iteration": 2.4322702884674072 }, { "auxiliary_loss_clip": 0.01072317, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.01732612, "balance_loss_mlp": 1.02465034, "epoch": 0.37673230121749585, "flos": 29128001996160.0, "grad_norm": 1.6215015501810288, "language_loss": 0.68630373, "learning_rate": 2.755669913135973e-06, "loss": 0.70733112, "num_input_tokens_seen": 134501110, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.4765625, "step": 6266, "time_per_iteration": 2.470392942428589 }, { "auxiliary_loss_clip": 0.01075878, "auxiliary_loss_mlp": 0.01033552, "balance_loss_clip": 1.01776886, "balance_loss_mlp": 1.02227426, "epoch": 0.3767924244701638, "flos": 28145545937280.0, "grad_norm": 3.0757996640837706, "language_loss": 0.63183129, "learning_rate": 2.755320084175794e-06, "loss": 0.65292561, "num_input_tokens_seen": 134522460, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.53515625, "step": 6267, "time_per_iteration": 3.894496202468872 }, { "auxiliary_loss_clip": 0.01014141, "auxiliary_loss_mlp": 0.01000666, "balance_loss_clip": 0.99904507, "balance_loss_mlp": 1.0044384, "epoch": 0.37685254772283183, "flos": 60794153723520.0, "grad_norm": 0.7232547072605194, "language_loss": 0.5886519, "learning_rate": 2.7549702282601847e-06, "loss": 0.60880005, "num_input_tokens_seen": 134589545, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.09667969, "step": 6268, "time_per_iteration": 3.177900791168213 }, { "auxiliary_loss_clip": 0.01075304, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.01556742, "balance_loss_mlp": 1.02371275, "epoch": 0.3769126709754998, "flos": 26031310467840.0, "grad_norm": 2.5875969271903445, "language_loss": 0.65139586, "learning_rate": 2.7546203454016294e-06, "loss": 0.67246711, "num_input_tokens_seen": 134610550, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.515625, "step": 6269, "time_per_iteration": 2.4589014053344727 }, { "auxiliary_loss_clip": 0.01074055, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.01830125, "balance_loss_mlp": 1.02519464, "epoch": 0.37697279422816776, "flos": 23578941680640.0, "grad_norm": 1.7417019894549575, "language_loss": 0.70729268, "learning_rate": 2.7542704356126154e-06, "loss": 0.7283777, "num_input_tokens_seen": 134630485, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.48828125, "step": 6270, "time_per_iteration": 2.4513440132141113 }, { "auxiliary_loss_clip": 0.01012673, "auxiliary_loss_mlp": 0.01001323, "balance_loss_clip": 0.99986249, "balance_loss_mlp": 1.00271273, "epoch": 0.3770329174808357, "flos": 64742550802560.0, "grad_norm": 0.7303118194290956, "language_loss": 0.56058031, "learning_rate": 2.7539204989056295e-06, "loss": 0.58072025, "num_input_tokens_seen": 134693510, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.09960938, "step": 6271, "time_per_iteration": 3.072298288345337 }, { "auxiliary_loss_clip": 0.01069461, "auxiliary_loss_mlp": 0.01030193, "balance_loss_clip": 1.01455879, "balance_loss_mlp": 1.02062869, "epoch": 0.3770930407335037, "flos": 21834226707840.0, "grad_norm": 6.395244620300128, "language_loss": 0.7960465, "learning_rate": 2.753570535293161e-06, "loss": 0.81704307, "num_input_tokens_seen": 134713115, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.48828125, "step": 6272, "time_per_iteration": 2.4121525287628174 }, { "auxiliary_loss_clip": 0.01070385, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.01643252, "balance_loss_mlp": 1.02212548, "epoch": 0.37715316398617166, "flos": 22746786491520.0, "grad_norm": 1.5296451727932296, "language_loss": 0.73967814, "learning_rate": 2.753220544787698e-06, "loss": 0.76069564, "num_input_tokens_seen": 134732635, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48242188, "step": 6273, "time_per_iteration": 2.4095168113708496 }, { "auxiliary_loss_clip": 0.01072926, "auxiliary_loss_mlp": 0.01033402, "balance_loss_clip": 1.01832223, "balance_loss_mlp": 1.02345395, "epoch": 0.3772132872388396, "flos": 18913637410560.0, "grad_norm": 1.4279827035167507, "language_loss": 0.7181142, "learning_rate": 2.7528705274017315e-06, "loss": 0.73917747, "num_input_tokens_seen": 134750695, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49414062, "step": 6274, "time_per_iteration": 2.4185595512390137 }, { "auxiliary_loss_clip": 0.01074572, "auxiliary_loss_mlp": 0.01030207, "balance_loss_clip": 1.01608706, "balance_loss_mlp": 1.0242064, "epoch": 0.3772734104915076, "flos": 17345303959680.0, "grad_norm": 1.5912507120991288, "language_loss": 0.83663416, "learning_rate": 2.752520483147752e-06, "loss": 0.85768199, "num_input_tokens_seen": 134768935, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.50390625, "step": 6275, "time_per_iteration": 2.3772263526916504 }, { "auxiliary_loss_clip": 0.01070714, "auxiliary_loss_mlp": 0.01024163, "balance_loss_clip": 1.01109779, "balance_loss_mlp": 1.02379036, "epoch": 0.37733353374417555, "flos": 32341023774720.0, "grad_norm": 1.9142428283276909, "language_loss": 0.75319606, "learning_rate": 2.7521704120382523e-06, "loss": 0.77414483, "num_input_tokens_seen": 134791260, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.46875, "step": 6276, "time_per_iteration": 2.4952826499938965 }, { "auxiliary_loss_clip": 0.01074544, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.01324272, "balance_loss_mlp": 1.02363646, "epoch": 0.3773936569968435, "flos": 23359756965120.0, "grad_norm": 2.3124840352635823, "language_loss": 0.85441828, "learning_rate": 2.7518203140857255e-06, "loss": 0.87545919, "num_input_tokens_seen": 134808350, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.5078125, "step": 6277, "time_per_iteration": 2.402688980102539 }, { "auxiliary_loss_clip": 0.01071467, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.0106039, "balance_loss_mlp": 1.02440333, "epoch": 0.3774537802495115, "flos": 21465823374720.0, "grad_norm": 1.7312496003702398, "language_loss": 0.78453618, "learning_rate": 2.7514701893026656e-06, "loss": 0.80549651, "num_input_tokens_seen": 134826005, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.47070312, "step": 6278, "time_per_iteration": 2.406852960586548 }, { "auxiliary_loss_clip": 0.01076556, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.01885355, "balance_loss_mlp": 1.02478933, "epoch": 0.37751390350217945, "flos": 24972534443520.0, "grad_norm": 1.6764196658342847, "language_loss": 0.83040118, "learning_rate": 2.751120037701568e-06, "loss": 0.85151172, "num_input_tokens_seen": 134844995, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.515625, "step": 6279, "time_per_iteration": 2.440063714981079 }, { "auxiliary_loss_clip": 0.01072209, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.01636267, "balance_loss_mlp": 1.02347875, "epoch": 0.3775740267548474, "flos": 27817851116160.0, "grad_norm": 2.0566915307246942, "language_loss": 0.7479161, "learning_rate": 2.7507698592949276e-06, "loss": 0.76894021, "num_input_tokens_seen": 134865285, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.48828125, "step": 6280, "time_per_iteration": 2.4522063732147217 }, { "auxiliary_loss_clip": 0.0107077, "auxiliary_loss_mlp": 0.0103151, "balance_loss_clip": 1.01828361, "balance_loss_mlp": 1.02410412, "epoch": 0.3776341500075154, "flos": 22564120924800.0, "grad_norm": 1.4303838571205254, "language_loss": 0.76384938, "learning_rate": 2.750419654095243e-06, "loss": 0.78487211, "num_input_tokens_seen": 134886535, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.46875, "step": 6281, "time_per_iteration": 2.4241533279418945 }, { "auxiliary_loss_clip": 0.01071908, "auxiliary_loss_mlp": 0.01030148, "balance_loss_clip": 1.01518691, "balance_loss_mlp": 1.02314448, "epoch": 0.3776942732601834, "flos": 23076087235200.0, "grad_norm": 1.3792810654890517, "language_loss": 0.84199941, "learning_rate": 2.75006942211501e-06, "loss": 0.86301994, "num_input_tokens_seen": 134907435, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48828125, "step": 6282, "time_per_iteration": 2.464942693710327 }, { "auxiliary_loss_clip": 0.01071301, "auxiliary_loss_mlp": 0.01028181, "balance_loss_clip": 1.01394749, "balance_loss_mlp": 1.02381754, "epoch": 0.37775439651285136, "flos": 21723377541120.0, "grad_norm": 1.6139905577321032, "language_loss": 0.69656861, "learning_rate": 2.74971916336673e-06, "loss": 0.71756351, "num_input_tokens_seen": 134925360, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.47460938, "step": 6283, "time_per_iteration": 2.400484561920166 }, { "auxiliary_loss_clip": 0.01073656, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.01753283, "balance_loss_mlp": 1.02443361, "epoch": 0.37781451976551933, "flos": 23986622160000.0, "grad_norm": 1.6480396498006487, "language_loss": 0.76377159, "learning_rate": 2.7493688778629012e-06, "loss": 0.78483856, "num_input_tokens_seen": 134944205, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.4921875, "step": 6284, "time_per_iteration": 2.422675132751465 }, { "auxiliary_loss_clip": 0.01077091, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.01945162, "balance_loss_mlp": 1.02470207, "epoch": 0.3778746430181873, "flos": 13727324787840.0, "grad_norm": 2.566403663663605, "language_loss": 0.85713154, "learning_rate": 2.7490185656160244e-06, "loss": 0.87825632, "num_input_tokens_seen": 134960255, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.5234375, "step": 6285, "time_per_iteration": 2.3761963844299316 }, { "auxiliary_loss_clip": 0.01074062, "auxiliary_loss_mlp": 0.01031416, "balance_loss_clip": 1.01485157, "balance_loss_mlp": 1.02363992, "epoch": 0.37793476627085526, "flos": 19459574340480.0, "grad_norm": 2.094610939693785, "language_loss": 0.84092492, "learning_rate": 2.7486682266386025e-06, "loss": 0.86197972, "num_input_tokens_seen": 134978605, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.50390625, "step": 6286, "time_per_iteration": 2.3991265296936035 }, { "auxiliary_loss_clip": 0.01070327, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.01445818, "balance_loss_mlp": 1.02181339, "epoch": 0.3779948895235232, "flos": 10706254997760.0, "grad_norm": 1.9934881542143204, "language_loss": 0.82329166, "learning_rate": 2.748317860943137e-06, "loss": 0.84428483, "num_input_tokens_seen": 134995020, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.484375, "step": 6287, "time_per_iteration": 2.378243923187256 }, { "auxiliary_loss_clip": 0.01072337, "auxiliary_loss_mlp": 0.01033723, "balance_loss_clip": 1.01854157, "balance_loss_mlp": 1.02278781, "epoch": 0.3780550127761912, "flos": 22308905819520.0, "grad_norm": 2.230633692956723, "language_loss": 0.73443872, "learning_rate": 2.747967468542132e-06, "loss": 0.75549936, "num_input_tokens_seen": 135012620, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49609375, "step": 6288, "time_per_iteration": 2.41206693649292 }, { "auxiliary_loss_clip": 0.01074087, "auxiliary_loss_mlp": 0.01029323, "balance_loss_clip": 1.01541138, "balance_loss_mlp": 1.02432442, "epoch": 0.37811513602885916, "flos": 28949351235840.0, "grad_norm": 1.5667994640064582, "language_loss": 0.75021863, "learning_rate": 2.7476170494480915e-06, "loss": 0.77125275, "num_input_tokens_seen": 135033365, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.49609375, "step": 6289, "time_per_iteration": 2.455432891845703 }, { "auxiliary_loss_clip": 0.01072542, "auxiliary_loss_mlp": 0.01030923, "balance_loss_clip": 1.01656473, "balance_loss_mlp": 1.02365494, "epoch": 0.3781752592815271, "flos": 23111803422720.0, "grad_norm": 1.979789191022597, "language_loss": 0.739528, "learning_rate": 2.7472666036735225e-06, "loss": 0.76056266, "num_input_tokens_seen": 135052185, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.48828125, "step": 6290, "time_per_iteration": 2.4248533248901367 }, { "auxiliary_loss_clip": 0.01073788, "auxiliary_loss_mlp": 0.01033367, "balance_loss_clip": 1.0163914, "balance_loss_mlp": 1.02196646, "epoch": 0.3782353825341951, "flos": 19754904464640.0, "grad_norm": 1.9777769257529845, "language_loss": 0.79205358, "learning_rate": 2.74691613123093e-06, "loss": 0.81312513, "num_input_tokens_seen": 135070425, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.51953125, "step": 6291, "time_per_iteration": 2.3897156715393066 }, { "auxiliary_loss_clip": 0.01074113, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.01504517, "balance_loss_mlp": 1.0225873, "epoch": 0.37829550578686305, "flos": 22049850464640.0, "grad_norm": 1.7934932271908763, "language_loss": 0.7603035, "learning_rate": 2.746565632132822e-06, "loss": 0.78135592, "num_input_tokens_seen": 135090525, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.515625, "step": 6292, "time_per_iteration": 2.4194183349609375 }, { "auxiliary_loss_clip": 0.01074045, "auxiliary_loss_mlp": 0.01037065, "balance_loss_clip": 1.02020836, "balance_loss_mlp": 1.02358246, "epoch": 0.378355629039531, "flos": 16469472792960.0, "grad_norm": 1.6865967932240056, "language_loss": 0.69078517, "learning_rate": 2.746215106391707e-06, "loss": 0.71189624, "num_input_tokens_seen": 135109575, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.50390625, "step": 6293, "time_per_iteration": 2.408169984817505 }, { "auxiliary_loss_clip": 0.01072111, "auxiliary_loss_mlp": 0.0103168, "balance_loss_clip": 1.01699996, "balance_loss_mlp": 1.02243948, "epoch": 0.378415752292199, "flos": 19973809889280.0, "grad_norm": 1.7124043676158545, "language_loss": 0.71046495, "learning_rate": 2.745864554020095e-06, "loss": 0.73150283, "num_input_tokens_seen": 135127000, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49804688, "step": 6294, "time_per_iteration": 2.3941314220428467 }, { "auxiliary_loss_clip": 0.01077332, "auxiliary_loss_mlp": 0.01030504, "balance_loss_clip": 1.01368988, "balance_loss_mlp": 1.02372241, "epoch": 0.378475875544867, "flos": 14646517729920.0, "grad_norm": 1.936339766769601, "language_loss": 0.82476676, "learning_rate": 2.7455139750304947e-06, "loss": 0.8458451, "num_input_tokens_seen": 135145285, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.5390625, "step": 6295, "time_per_iteration": 2.3830618858337402 }, { "auxiliary_loss_clip": 0.01072647, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.0137043, "balance_loss_mlp": 1.02266371, "epoch": 0.37853599879753497, "flos": 26649796936320.0, "grad_norm": 1.8331920370295154, "language_loss": 0.71501195, "learning_rate": 2.7451633694354194e-06, "loss": 0.73603237, "num_input_tokens_seen": 135165240, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5, "step": 6296, "time_per_iteration": 3.8663713932037354 }, { "auxiliary_loss_clip": 0.0107309, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.02033329, "balance_loss_mlp": 1.02387309, "epoch": 0.37859612205020293, "flos": 17310984226560.0, "grad_norm": 1.923398765396373, "language_loss": 0.76767504, "learning_rate": 2.7448127372473793e-06, "loss": 0.78876019, "num_input_tokens_seen": 135184045, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.4921875, "step": 6297, "time_per_iteration": 2.374267816543579 }, { "auxiliary_loss_clip": 0.01073937, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.01524556, "balance_loss_mlp": 1.02417874, "epoch": 0.3786562453028709, "flos": 18219494292480.0, "grad_norm": 1.8864093205172274, "language_loss": 0.79092687, "learning_rate": 2.7444620784788887e-06, "loss": 0.81197131, "num_input_tokens_seen": 135202365, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49609375, "step": 6298, "time_per_iteration": 2.3878748416900635 }, { "auxiliary_loss_clip": 0.01070902, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.01406288, "balance_loss_mlp": 1.02275205, "epoch": 0.37871636855553886, "flos": 21213820114560.0, "grad_norm": 1.4950767241682503, "language_loss": 0.84254462, "learning_rate": 2.744111393142462e-06, "loss": 0.86354256, "num_input_tokens_seen": 135220955, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.48242188, "step": 6299, "time_per_iteration": 2.3771533966064453 }, { "auxiliary_loss_clip": 0.01073882, "auxiliary_loss_mlp": 0.0103219, "balance_loss_clip": 1.01631761, "balance_loss_mlp": 1.02298355, "epoch": 0.3787764918082068, "flos": 20951867116800.0, "grad_norm": 2.274343151011981, "language_loss": 0.76291323, "learning_rate": 2.743760681250613e-06, "loss": 0.78397393, "num_input_tokens_seen": 135239715, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5078125, "step": 6300, "time_per_iteration": 2.395735025405884 }, { "auxiliary_loss_clip": 0.01076958, "auxiliary_loss_mlp": 0.01036494, "balance_loss_clip": 1.01796937, "balance_loss_mlp": 1.02357352, "epoch": 0.3788366150608748, "flos": 17307143976960.0, "grad_norm": 2.000230347571311, "language_loss": 0.82237911, "learning_rate": 2.743409942815859e-06, "loss": 0.84351361, "num_input_tokens_seen": 135257035, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.53515625, "step": 6301, "time_per_iteration": 3.9260175228118896 }, { "auxiliary_loss_clip": 0.01072777, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.01597679, "balance_loss_mlp": 1.0223999, "epoch": 0.37889673831354276, "flos": 24310092706560.0, "grad_norm": 1.7917661272143974, "language_loss": 0.67988241, "learning_rate": 2.743059177850716e-06, "loss": 0.70092809, "num_input_tokens_seen": 135275720, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.50390625, "step": 6302, "time_per_iteration": 2.4174301624298096 }, { "auxiliary_loss_clip": 0.01074916, "auxiliary_loss_mlp": 0.01027897, "balance_loss_clip": 1.01318622, "balance_loss_mlp": 1.02632046, "epoch": 0.3789568615662107, "flos": 26682510746880.0, "grad_norm": 1.8529723406615421, "language_loss": 0.68552291, "learning_rate": 2.7427083863677035e-06, "loss": 0.70655102, "num_input_tokens_seen": 135294140, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.48632812, "step": 6303, "time_per_iteration": 3.881007194519043 }, { "auxiliary_loss_clip": 0.01070759, "auxiliary_loss_mlp": 0.01025119, "balance_loss_clip": 1.01091516, "balance_loss_mlp": 1.02199697, "epoch": 0.3790169848188787, "flos": 23584108561920.0, "grad_norm": 1.6044972558547639, "language_loss": 0.77564681, "learning_rate": 2.742357568379338e-06, "loss": 0.79660559, "num_input_tokens_seen": 135314845, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48828125, "step": 6304, "time_per_iteration": 2.4242565631866455 }, { "auxiliary_loss_clip": 0.01076963, "auxiliary_loss_mlp": 0.0102708, "balance_loss_clip": 1.01080799, "balance_loss_mlp": 1.02434444, "epoch": 0.37907710807154665, "flos": 18436584326400.0, "grad_norm": 2.16826351294024, "language_loss": 0.80214989, "learning_rate": 2.7420067238981405e-06, "loss": 0.82319027, "num_input_tokens_seen": 135333055, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.52734375, "step": 6305, "time_per_iteration": 2.389779567718506 }, { "auxiliary_loss_clip": 0.01011544, "auxiliary_loss_mlp": 0.01006682, "balance_loss_clip": 1.005198, "balance_loss_mlp": 1.00198126, "epoch": 0.3791372313242146, "flos": 50104411799040.0, "grad_norm": 0.9611161886767963, "language_loss": 0.64461195, "learning_rate": 2.741655852936632e-06, "loss": 0.66479421, "num_input_tokens_seen": 135387865, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.09570312, "step": 6306, "time_per_iteration": 2.95611834526062 }, { "auxiliary_loss_clip": 0.01076049, "auxiliary_loss_mlp": 0.01040722, "balance_loss_clip": 1.02399743, "balance_loss_mlp": 1.0246563, "epoch": 0.3791973545768826, "flos": 24315399233280.0, "grad_norm": 1.4776211144927904, "language_loss": 0.73504448, "learning_rate": 2.741304955507334e-06, "loss": 0.75621223, "num_input_tokens_seen": 135409095, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.515625, "step": 6307, "time_per_iteration": 3.84859561920166 }, { "auxiliary_loss_clip": 0.01076621, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.02015483, "balance_loss_mlp": 1.02474093, "epoch": 0.3792574778295506, "flos": 21578837045760.0, "grad_norm": 1.5295517112865868, "language_loss": 0.78396779, "learning_rate": 2.7409540316227686e-06, "loss": 0.80509782, "num_input_tokens_seen": 135429585, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.51953125, "step": 6308, "time_per_iteration": 2.4184396266937256 }, { "auxiliary_loss_clip": 0.01070955, "auxiliary_loss_mlp": 0.01029698, "balance_loss_clip": 1.01374197, "balance_loss_mlp": 1.02192664, "epoch": 0.37931760108221857, "flos": 22271653532160.0, "grad_norm": 3.2319191580634827, "language_loss": 0.73008668, "learning_rate": 2.7406030812954596e-06, "loss": 0.75109327, "num_input_tokens_seen": 135446320, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.49023438, "step": 6309, "time_per_iteration": 2.3869950771331787 }, { "auxiliary_loss_clip": 0.01072777, "auxiliary_loss_mlp": 0.01030266, "balance_loss_clip": 1.01474452, "balance_loss_mlp": 1.02375412, "epoch": 0.37937772433488653, "flos": 19681970901120.0, "grad_norm": 1.4278587239728795, "language_loss": 0.78796041, "learning_rate": 2.740252104537932e-06, "loss": 0.8089909, "num_input_tokens_seen": 135465720, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.4921875, "step": 6310, "time_per_iteration": 2.410391092300415 }, { "auxiliary_loss_clip": 0.01073416, "auxiliary_loss_mlp": 0.01028842, "balance_loss_clip": 1.01381028, "balance_loss_mlp": 1.02311683, "epoch": 0.3794378475875545, "flos": 19098362747520.0, "grad_norm": 1.935800809432045, "language_loss": 0.76142395, "learning_rate": 2.7399011013627112e-06, "loss": 0.78244656, "num_input_tokens_seen": 135485155, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.50390625, "step": 6311, "time_per_iteration": 2.391130208969116 }, { "auxiliary_loss_clip": 0.01073151, "auxiliary_loss_mlp": 0.01031542, "balance_loss_clip": 1.01680803, "balance_loss_mlp": 1.0238483, "epoch": 0.37949797084022246, "flos": 20338617352320.0, "grad_norm": 1.6251359330124207, "language_loss": 0.70789325, "learning_rate": 2.7395500717823233e-06, "loss": 0.72894013, "num_input_tokens_seen": 135502675, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.4921875, "step": 6312, "time_per_iteration": 2.395550489425659 }, { "auxiliary_loss_clip": 0.01011316, "auxiliary_loss_mlp": 0.01000873, "balance_loss_clip": 0.99937081, "balance_loss_mlp": 1.00167561, "epoch": 0.37955809409289043, "flos": 63969050430720.0, "grad_norm": 0.7845213858642633, "language_loss": 0.56086898, "learning_rate": 2.739199015809296e-06, "loss": 0.58099091, "num_input_tokens_seen": 135562005, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.09667969, "step": 6313, "time_per_iteration": 3.005631923675537 }, { "auxiliary_loss_clip": 0.01072487, "auxiliary_loss_mlp": 0.01034646, "balance_loss_clip": 1.01942253, "balance_loss_mlp": 1.0229528, "epoch": 0.3796182173455584, "flos": 31539313157760.0, "grad_norm": 1.9726428742886393, "language_loss": 0.71231735, "learning_rate": 2.738847933456156e-06, "loss": 0.73338866, "num_input_tokens_seen": 135582600, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49609375, "step": 6314, "time_per_iteration": 2.5943915843963623 }, { "auxiliary_loss_clip": 0.0107584, "auxiliary_loss_mlp": 0.01032347, "balance_loss_clip": 1.01571691, "balance_loss_mlp": 1.02327943, "epoch": 0.37967834059822636, "flos": 12129978130560.0, "grad_norm": 1.7401836139182052, "language_loss": 0.73164737, "learning_rate": 2.738496824735435e-06, "loss": 0.75272924, "num_input_tokens_seen": 135600280, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.5234375, "step": 6315, "time_per_iteration": 2.3773343563079834 }, { "auxiliary_loss_clip": 0.01074051, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.01839304, "balance_loss_mlp": 1.02460265, "epoch": 0.3797384638508943, "flos": 39347009792640.0, "grad_norm": 1.8086305695487093, "language_loss": 0.70935374, "learning_rate": 2.738145689659661e-06, "loss": 0.73043054, "num_input_tokens_seen": 135621560, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49414062, "step": 6316, "time_per_iteration": 2.5793564319610596 }, { "auxiliary_loss_clip": 0.01072395, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.01746511, "balance_loss_mlp": 1.02356768, "epoch": 0.3797985871035623, "flos": 34052710734720.0, "grad_norm": 2.327006941177403, "language_loss": 0.65011269, "learning_rate": 2.737794528241367e-06, "loss": 0.67115074, "num_input_tokens_seen": 135641745, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.48828125, "step": 6317, "time_per_iteration": 2.5013041496276855 }, { "auxiliary_loss_clip": 0.0107017, "auxiliary_loss_mlp": 0.01026863, "balance_loss_clip": 1.01313019, "balance_loss_mlp": 1.02251899, "epoch": 0.37985871035623026, "flos": 23221046666880.0, "grad_norm": 2.5560401766138963, "language_loss": 0.84915054, "learning_rate": 2.737443340493084e-06, "loss": 0.87012076, "num_input_tokens_seen": 135660650, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.4765625, "step": 6318, "time_per_iteration": 2.42867374420166 }, { "auxiliary_loss_clip": 0.010729, "auxiliary_loss_mlp": 0.01034103, "balance_loss_clip": 1.01797414, "balance_loss_mlp": 1.02248192, "epoch": 0.3799188336088982, "flos": 18113951652480.0, "grad_norm": 2.073614079920041, "language_loss": 0.76291788, "learning_rate": 2.737092126427345e-06, "loss": 0.78398788, "num_input_tokens_seen": 135679980, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.50390625, "step": 6319, "time_per_iteration": 2.3749430179595947 }, { "auxiliary_loss_clip": 0.01073521, "auxiliary_loss_mlp": 0.01035917, "balance_loss_clip": 1.02171898, "balance_loss_mlp": 1.02473748, "epoch": 0.3799789568615662, "flos": 21870815679360.0, "grad_norm": 1.8058167086776955, "language_loss": 0.64246219, "learning_rate": 2.736740886056684e-06, "loss": 0.66355658, "num_input_tokens_seen": 135699400, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48632812, "step": 6320, "time_per_iteration": 2.43791127204895 }, { "auxiliary_loss_clip": 0.01072692, "auxiliary_loss_mlp": 0.01035849, "balance_loss_clip": 1.0207696, "balance_loss_mlp": 1.02388418, "epoch": 0.3800390801142342, "flos": 32961570013440.0, "grad_norm": 1.6897334924965235, "language_loss": 0.70931941, "learning_rate": 2.7363896193936356e-06, "loss": 0.73040479, "num_input_tokens_seen": 135723455, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.48828125, "step": 6321, "time_per_iteration": 2.504054546356201 }, { "auxiliary_loss_clip": 0.01073745, "auxiliary_loss_mlp": 0.01028804, "balance_loss_clip": 1.01347327, "balance_loss_mlp": 1.02246118, "epoch": 0.38009920336690217, "flos": 26905849914240.0, "grad_norm": 1.8903169460824751, "language_loss": 0.74813843, "learning_rate": 2.7360383264507364e-06, "loss": 0.76916397, "num_input_tokens_seen": 135744335, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.51171875, "step": 6322, "time_per_iteration": 2.4458911418914795 }, { "auxiliary_loss_clip": 0.01071188, "auxiliary_loss_mlp": 0.01032696, "balance_loss_clip": 1.01799774, "balance_loss_mlp": 1.02290261, "epoch": 0.38015932661957014, "flos": 22487905693440.0, "grad_norm": 2.0829829938278053, "language_loss": 0.85222924, "learning_rate": 2.735687007240522e-06, "loss": 0.87326807, "num_input_tokens_seen": 135761440, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.48242188, "step": 6323, "time_per_iteration": 2.3958194255828857 }, { "auxiliary_loss_clip": 0.01073985, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.01593733, "balance_loss_mlp": 1.0234499, "epoch": 0.3802194498722381, "flos": 21979919278080.0, "grad_norm": 2.4657232571983982, "language_loss": 0.73532248, "learning_rate": 2.735335661775531e-06, "loss": 0.75637954, "num_input_tokens_seen": 135779955, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5078125, "step": 6324, "time_per_iteration": 2.4065213203430176 }, { "auxiliary_loss_clip": 0.01075219, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01786327, "balance_loss_mlp": 1.02478135, "epoch": 0.38027957312490607, "flos": 21323796497280.0, "grad_norm": 1.850138936990718, "language_loss": 0.84612614, "learning_rate": 2.734984290068302e-06, "loss": 0.86721122, "num_input_tokens_seen": 135799840, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.50390625, "step": 6325, "time_per_iteration": 2.402581214904785 }, { "auxiliary_loss_clip": 0.01073272, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.01516056, "balance_loss_mlp": 1.02385521, "epoch": 0.38033969637757403, "flos": 16690298342400.0, "grad_norm": 2.2045871825243117, "language_loss": 0.79654181, "learning_rate": 2.734632892131374e-06, "loss": 0.81756878, "num_input_tokens_seen": 135817880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.4921875, "step": 6326, "time_per_iteration": 2.3796725273132324 }, { "auxiliary_loss_clip": 0.0107083, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.01546466, "balance_loss_mlp": 1.0218854, "epoch": 0.380399819630242, "flos": 36209365372800.0, "grad_norm": 2.4240213340391743, "language_loss": 0.73113871, "learning_rate": 2.734281467977288e-06, "loss": 0.75214136, "num_input_tokens_seen": 135838940, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.48828125, "step": 6327, "time_per_iteration": 2.565790891647339 }, { "auxiliary_loss_clip": 0.01071873, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.01784277, "balance_loss_mlp": 1.02409625, "epoch": 0.38045994288290996, "flos": 21287766107520.0, "grad_norm": 1.494018950564544, "language_loss": 0.83084941, "learning_rate": 2.733930017618585e-06, "loss": 0.8519057, "num_input_tokens_seen": 135858325, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.4765625, "step": 6328, "time_per_iteration": 2.3930253982543945 }, { "auxiliary_loss_clip": 0.01069988, "auxiliary_loss_mlp": 0.01029184, "balance_loss_clip": 1.01433659, "balance_loss_mlp": 1.02169073, "epoch": 0.38052006613557793, "flos": 20921841480960.0, "grad_norm": 1.4257515592237942, "language_loss": 0.61200964, "learning_rate": 2.733578541067808e-06, "loss": 0.63300145, "num_input_tokens_seen": 135878430, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.484375, "step": 6329, "time_per_iteration": 2.41375732421875 }, { "auxiliary_loss_clip": 0.01072631, "auxiliary_loss_mlp": 0.01033787, "balance_loss_clip": 1.01839685, "balance_loss_mlp": 1.02264071, "epoch": 0.3805801893882459, "flos": 20989817631360.0, "grad_norm": 2.8991918272289494, "language_loss": 0.56211734, "learning_rate": 2.733227038337499e-06, "loss": 0.58318144, "num_input_tokens_seen": 135894755, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5, "step": 6330, "time_per_iteration": 2.3851916790008545 }, { "auxiliary_loss_clip": 0.01070925, "auxiliary_loss_mlp": 0.01027293, "balance_loss_clip": 1.01420426, "balance_loss_mlp": 1.02431893, "epoch": 0.38064031264091386, "flos": 25557364494720.0, "grad_norm": 2.134191527011971, "language_loss": 0.65927303, "learning_rate": 2.7328755094402036e-06, "loss": 0.68025517, "num_input_tokens_seen": 135918275, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.46484375, "step": 6331, "time_per_iteration": 2.479475498199463 }, { "auxiliary_loss_clip": 0.01073984, "auxiliary_loss_mlp": 0.0104059, "balance_loss_clip": 1.02493167, "balance_loss_mlp": 1.0249294, "epoch": 0.3807004358935818, "flos": 15084956983680.0, "grad_norm": 1.6180832428334229, "language_loss": 0.75569254, "learning_rate": 2.732523954388466e-06, "loss": 0.7768383, "num_input_tokens_seen": 135937430, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.49023438, "step": 6332, "time_per_iteration": 2.3726158142089844 }, { "auxiliary_loss_clip": 0.01072455, "auxiliary_loss_mlp": 0.01029656, "balance_loss_clip": 1.01440954, "balance_loss_mlp": 1.0220623, "epoch": 0.3807605591462498, "flos": 16398459354240.0, "grad_norm": 2.0739452714131534, "language_loss": 0.82257962, "learning_rate": 2.732172373194834e-06, "loss": 0.84360069, "num_input_tokens_seen": 135954210, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.50390625, "step": 6333, "time_per_iteration": 2.3743832111358643 }, { "auxiliary_loss_clip": 0.01069972, "auxiliary_loss_mlp": 0.01027534, "balance_loss_clip": 1.01314545, "balance_loss_mlp": 1.02167153, "epoch": 0.3808206823989178, "flos": 29055871393920.0, "grad_norm": 1.5541221219981671, "language_loss": 0.86296082, "learning_rate": 2.731820765871853e-06, "loss": 0.88393587, "num_input_tokens_seen": 135974425, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48242188, "step": 6334, "time_per_iteration": 2.452577590942383 }, { "auxiliary_loss_clip": 0.01071537, "auxiliary_loss_mlp": 0.0103303, "balance_loss_clip": 1.01790214, "balance_loss_mlp": 1.02261972, "epoch": 0.3808808056515858, "flos": 15704944640640.0, "grad_norm": 7.015227393238082, "language_loss": 0.79345757, "learning_rate": 2.7314691324320705e-06, "loss": 0.81450319, "num_input_tokens_seen": 135991985, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.48828125, "step": 6335, "time_per_iteration": 2.3867104053497314 }, { "auxiliary_loss_clip": 0.01073225, "auxiliary_loss_mlp": 0.0103004, "balance_loss_clip": 1.01421452, "balance_loss_mlp": 1.02300286, "epoch": 0.38094092890425374, "flos": 20703529549440.0, "grad_norm": 2.5870340626875112, "language_loss": 0.72556508, "learning_rate": 2.7311174728880364e-06, "loss": 0.74659771, "num_input_tokens_seen": 136010015, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.5, "step": 6336, "time_per_iteration": 3.906203508377075 }, { "auxiliary_loss_clip": 0.01070224, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.01577568, "balance_loss_mlp": 1.0226469, "epoch": 0.3810010521569217, "flos": 20666905666560.0, "grad_norm": 1.8096865588641151, "language_loss": 0.69679999, "learning_rate": 2.730765787252301e-06, "loss": 0.71780109, "num_input_tokens_seen": 136028440, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.4765625, "step": 6337, "time_per_iteration": 2.3771214485168457 }, { "auxiliary_loss_clip": 0.0107268, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.01425695, "balance_loss_mlp": 1.0236088, "epoch": 0.38106117540958967, "flos": 31826404200960.0, "grad_norm": 2.406627064099211, "language_loss": 0.63582182, "learning_rate": 2.7304140755374137e-06, "loss": 0.65684134, "num_input_tokens_seen": 136048360, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49023438, "step": 6338, "time_per_iteration": 2.477311611175537 }, { "auxiliary_loss_clip": 0.01072969, "auxiliary_loss_mlp": 0.01031572, "balance_loss_clip": 1.01634908, "balance_loss_mlp": 1.02388, "epoch": 0.38112129866225763, "flos": 16902012026880.0, "grad_norm": 2.919416394949127, "language_loss": 0.69507802, "learning_rate": 2.7300623377559273e-06, "loss": 0.71612334, "num_input_tokens_seen": 136065500, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.4921875, "step": 6339, "time_per_iteration": 2.360309600830078 }, { "auxiliary_loss_clip": 0.0107491, "auxiliary_loss_mlp": 0.01036389, "balance_loss_clip": 1.02217925, "balance_loss_mlp": 1.02416372, "epoch": 0.3811814219149256, "flos": 20886160204800.0, "grad_norm": 3.2329403842978155, "language_loss": 0.68271172, "learning_rate": 2.729710573920394e-06, "loss": 0.70382476, "num_input_tokens_seen": 136084060, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.5078125, "step": 6340, "time_per_iteration": 2.4211418628692627 }, { "auxiliary_loss_clip": 0.0107451, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.01581609, "balance_loss_mlp": 1.0231539, "epoch": 0.38124154516759357, "flos": 16689879406080.0, "grad_norm": 1.94844788594245, "language_loss": 0.89741421, "learning_rate": 2.729358784043367e-06, "loss": 0.91847742, "num_input_tokens_seen": 136102310, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.515625, "step": 6341, "time_per_iteration": 3.7907562255859375 }, { "auxiliary_loss_clip": 0.01075309, "auxiliary_loss_mlp": 0.01033236, "balance_loss_clip": 1.01729822, "balance_loss_mlp": 1.02402592, "epoch": 0.38130166842026153, "flos": 19680958471680.0, "grad_norm": 1.6311095392505879, "language_loss": 0.75363111, "learning_rate": 2.7290069681374018e-06, "loss": 0.77471656, "num_input_tokens_seen": 136120725, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.515625, "step": 6342, "time_per_iteration": 2.39212703704834 }, { "auxiliary_loss_clip": 0.01072937, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.01666236, "balance_loss_mlp": 1.0224123, "epoch": 0.3813617916729295, "flos": 22197393336960.0, "grad_norm": 1.6209662581670443, "language_loss": 0.83417922, "learning_rate": 2.7286551262150522e-06, "loss": 0.85522187, "num_input_tokens_seen": 136139105, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.50390625, "step": 6343, "time_per_iteration": 3.7917282581329346 }, { "auxiliary_loss_clip": 0.01070836, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.01746726, "balance_loss_mlp": 1.02193069, "epoch": 0.38142191492559746, "flos": 19095953863680.0, "grad_norm": 1.6678018005756956, "language_loss": 0.76731622, "learning_rate": 2.7283032582888763e-06, "loss": 0.78834707, "num_input_tokens_seen": 136158265, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.48828125, "step": 6344, "time_per_iteration": 2.38700008392334 }, { "auxiliary_loss_clip": 0.01075757, "auxiliary_loss_mlp": 0.01032631, "balance_loss_clip": 1.01753259, "balance_loss_mlp": 1.02496266, "epoch": 0.3814820381782654, "flos": 24096598542720.0, "grad_norm": 2.4158382563911376, "language_loss": 0.73242879, "learning_rate": 2.7279513643714304e-06, "loss": 0.75351268, "num_input_tokens_seen": 136176100, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 6345, "time_per_iteration": 2.4203360080718994 }, { "auxiliary_loss_clip": 0.01070512, "auxiliary_loss_mlp": 0.0102623, "balance_loss_clip": 1.01216364, "balance_loss_mlp": 1.02177739, "epoch": 0.3815421614309334, "flos": 15777598913280.0, "grad_norm": 1.6191840928302095, "language_loss": 0.69535041, "learning_rate": 2.727599444475272e-06, "loss": 0.71631777, "num_input_tokens_seen": 136195125, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.48828125, "step": 6346, "time_per_iteration": 3.7785873413085938 }, { "auxiliary_loss_clip": 0.01073276, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.01497293, "balance_loss_mlp": 1.02398419, "epoch": 0.38160228468360136, "flos": 19898781644160.0, "grad_norm": 1.7791035163357487, "language_loss": 0.74928927, "learning_rate": 2.7272474986129622e-06, "loss": 0.77032089, "num_input_tokens_seen": 136213885, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.4921875, "step": 6347, "time_per_iteration": 2.3902587890625 }, { "auxiliary_loss_clip": 0.01071621, "auxiliary_loss_mlp": 0.01030149, "balance_loss_clip": 1.01610613, "balance_loss_mlp": 1.02165222, "epoch": 0.3816624079362694, "flos": 19280050796160.0, "grad_norm": 3.775672963559628, "language_loss": 0.74341285, "learning_rate": 2.7268955267970594e-06, "loss": 0.76443052, "num_input_tokens_seen": 136232700, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.5, "step": 6348, "time_per_iteration": 2.368314743041992 }, { "auxiliary_loss_clip": 0.01070911, "auxiliary_loss_mlp": 0.01027743, "balance_loss_clip": 1.01357532, "balance_loss_mlp": 1.02221847, "epoch": 0.38172253118893734, "flos": 21176532915840.0, "grad_norm": 2.345112785963392, "language_loss": 0.87265092, "learning_rate": 2.726543529040125e-06, "loss": 0.89363742, "num_input_tokens_seen": 136248975, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48632812, "step": 6349, "time_per_iteration": 2.3773574829101562 }, { "auxiliary_loss_clip": 0.01071461, "auxiliary_loss_mlp": 0.01028906, "balance_loss_clip": 1.01403427, "balance_loss_mlp": 1.02234077, "epoch": 0.3817826544416053, "flos": 17528283728640.0, "grad_norm": 1.6432091634171837, "language_loss": 0.76380199, "learning_rate": 2.7261915053547216e-06, "loss": 0.78480566, "num_input_tokens_seen": 136266710, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.4921875, "step": 6350, "time_per_iteration": 2.348567485809326 }, { "auxiliary_loss_clip": 0.01071636, "auxiliary_loss_mlp": 0.01026879, "balance_loss_clip": 1.01115584, "balance_loss_mlp": 1.02230453, "epoch": 0.38184277769427327, "flos": 16325595613440.0, "grad_norm": 1.9467085013459524, "language_loss": 0.75772917, "learning_rate": 2.7258394557534103e-06, "loss": 0.77871436, "num_input_tokens_seen": 136284445, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.4921875, "step": 6351, "time_per_iteration": 2.3566505908966064 }, { "auxiliary_loss_clip": 0.01074607, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.01671815, "balance_loss_mlp": 1.02312076, "epoch": 0.38190290094694124, "flos": 30442202593920.0, "grad_norm": 1.752944037582217, "language_loss": 0.74010742, "learning_rate": 2.725487380248756e-06, "loss": 0.76117879, "num_input_tokens_seen": 136305730, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.515625, "step": 6352, "time_per_iteration": 2.478961944580078 }, { "auxiliary_loss_clip": 0.01069502, "auxiliary_loss_mlp": 0.01025978, "balance_loss_clip": 1.01283479, "balance_loss_mlp": 1.02186477, "epoch": 0.3819630241996092, "flos": 14209055994240.0, "grad_norm": 1.8667879904088551, "language_loss": 0.63989282, "learning_rate": 2.7251352788533237e-06, "loss": 0.66084754, "num_input_tokens_seen": 136323850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.4765625, "step": 6353, "time_per_iteration": 2.390054941177368 }, { "auxiliary_loss_clip": 0.01069347, "auxiliary_loss_mlp": 0.01028094, "balance_loss_clip": 1.01348531, "balance_loss_mlp": 1.02158535, "epoch": 0.38202314745227717, "flos": 25008529921920.0, "grad_norm": 1.5842716576676679, "language_loss": 0.83193064, "learning_rate": 2.7247831515796786e-06, "loss": 0.85290504, "num_input_tokens_seen": 136344880, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.4765625, "step": 6354, "time_per_iteration": 2.4330496788024902 }, { "auxiliary_loss_clip": 0.01071746, "auxiliary_loss_mlp": 0.01026443, "balance_loss_clip": 1.01256669, "balance_loss_mlp": 1.02348781, "epoch": 0.38208327070494513, "flos": 20813436109440.0, "grad_norm": 1.7118059923719808, "language_loss": 0.80289, "learning_rate": 2.7244309984403865e-06, "loss": 0.82387185, "num_input_tokens_seen": 136366060, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.484375, "step": 6355, "time_per_iteration": 2.4107625484466553 }, { "auxiliary_loss_clip": 0.01072404, "auxiliary_loss_mlp": 0.01029717, "balance_loss_clip": 1.01535213, "balance_loss_mlp": 1.02283466, "epoch": 0.3821433939576131, "flos": 22636635552000.0, "grad_norm": 1.8051748239290533, "language_loss": 0.75453568, "learning_rate": 2.7240788194480163e-06, "loss": 0.77555686, "num_input_tokens_seen": 136385625, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49609375, "step": 6356, "time_per_iteration": 2.426191806793213 }, { "auxiliary_loss_clip": 0.01071852, "auxiliary_loss_mlp": 0.01027571, "balance_loss_clip": 1.01277745, "balance_loss_mlp": 1.02296412, "epoch": 0.38220351721028106, "flos": 26868667449600.0, "grad_norm": 2.743462649009737, "language_loss": 0.81357545, "learning_rate": 2.7237266146151357e-06, "loss": 0.83456969, "num_input_tokens_seen": 136405750, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.48828125, "step": 6357, "time_per_iteration": 2.4445321559906006 }, { "auxiliary_loss_clip": 0.01077669, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.01894021, "balance_loss_mlp": 1.02636087, "epoch": 0.38226364046294903, "flos": 23366355212160.0, "grad_norm": 1.6343022709106763, "language_loss": 0.77832496, "learning_rate": 2.7233743839543135e-06, "loss": 0.79946357, "num_input_tokens_seen": 136426085, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.51171875, "step": 6358, "time_per_iteration": 2.4251880645751953 }, { "auxiliary_loss_clip": 0.01073259, "auxiliary_loss_mlp": 0.01032635, "balance_loss_clip": 1.017591, "balance_loss_mlp": 1.02269793, "epoch": 0.382323763715617, "flos": 19645207372800.0, "grad_norm": 2.192897322384427, "language_loss": 0.79104972, "learning_rate": 2.7230221274781204e-06, "loss": 0.81210864, "num_input_tokens_seen": 136442670, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 6359, "time_per_iteration": 2.3775861263275146 }, { "auxiliary_loss_clip": 0.01072341, "auxiliary_loss_mlp": 0.01026915, "balance_loss_clip": 1.01232946, "balance_loss_mlp": 1.02312422, "epoch": 0.38238388696828496, "flos": 54122776842240.0, "grad_norm": 1.8630713747801777, "language_loss": 0.69627303, "learning_rate": 2.722669845199127e-06, "loss": 0.71726561, "num_input_tokens_seen": 136465730, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.4921875, "step": 6360, "time_per_iteration": 2.6730566024780273 }, { "auxiliary_loss_clip": 0.01071773, "auxiliary_loss_mlp": 0.01024345, "balance_loss_clip": 1.00970566, "balance_loss_mlp": 1.02280796, "epoch": 0.382444010220953, "flos": 24935037776640.0, "grad_norm": 1.584770867287631, "language_loss": 0.78851461, "learning_rate": 2.7223175371299062e-06, "loss": 0.80947578, "num_input_tokens_seen": 136487215, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48828125, "step": 6361, "time_per_iteration": 2.4543378353118896 }, { "auxiliary_loss_clip": 0.01069438, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.01302397, "balance_loss_mlp": 1.02275753, "epoch": 0.38250413347362094, "flos": 42335784708480.0, "grad_norm": 1.3613893411603089, "language_loss": 0.65544206, "learning_rate": 2.72196520328303e-06, "loss": 0.67640305, "num_input_tokens_seen": 136510365, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.46679688, "step": 6362, "time_per_iteration": 2.5893757343292236 }, { "auxiliary_loss_clip": 0.01070188, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.01351023, "balance_loss_mlp": 1.02255678, "epoch": 0.3825642567262889, "flos": 16288308414720.0, "grad_norm": 1.7003943566148527, "language_loss": 0.8184911, "learning_rate": 2.7216128436710737e-06, "loss": 0.83947611, "num_input_tokens_seen": 136527100, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.4765625, "step": 6363, "time_per_iteration": 2.398024320602417 }, { "auxiliary_loss_clip": 0.01071189, "auxiliary_loss_mlp": 0.01028626, "balance_loss_clip": 1.01458287, "balance_loss_mlp": 1.02377224, "epoch": 0.3826243799789569, "flos": 45653197052160.0, "grad_norm": 1.8400656753131923, "language_loss": 0.58986646, "learning_rate": 2.7212604583066107e-06, "loss": 0.61086464, "num_input_tokens_seen": 136550870, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.47460938, "step": 6364, "time_per_iteration": 2.6245498657226562 }, { "auxiliary_loss_clip": 0.01072472, "auxiliary_loss_mlp": 0.01029269, "balance_loss_clip": 1.0138073, "balance_loss_mlp": 1.02297175, "epoch": 0.38268450323162484, "flos": 25300403821440.0, "grad_norm": 2.5557872774094137, "language_loss": 0.69215167, "learning_rate": 2.7209080472022174e-06, "loss": 0.7131691, "num_input_tokens_seen": 136569895, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.49609375, "step": 6365, "time_per_iteration": 2.4672000408172607 }, { "auxiliary_loss_clip": 0.01072812, "auxiliary_loss_mlp": 0.01028341, "balance_loss_clip": 1.01259947, "balance_loss_mlp": 1.02237844, "epoch": 0.3827446264842928, "flos": 21834924935040.0, "grad_norm": 2.041020458544721, "language_loss": 0.73036033, "learning_rate": 2.72055561037047e-06, "loss": 0.75137186, "num_input_tokens_seen": 136588585, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.50390625, "step": 6366, "time_per_iteration": 2.4111130237579346 }, { "auxiliary_loss_clip": 0.01073342, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.01455986, "balance_loss_mlp": 1.02303851, "epoch": 0.38280474973696077, "flos": 25733536548480.0, "grad_norm": 2.2766831018131457, "language_loss": 0.68395281, "learning_rate": 2.720203147823947e-06, "loss": 0.70499289, "num_input_tokens_seen": 136606640, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.50390625, "step": 6367, "time_per_iteration": 2.4468324184417725 }, { "auxiliary_loss_clip": 0.0107014, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.01766062, "balance_loss_mlp": 1.0221231, "epoch": 0.38286487298962874, "flos": 24894887846400.0, "grad_norm": 1.9425800311136414, "language_loss": 0.63679582, "learning_rate": 2.719850659575225e-06, "loss": 0.65782344, "num_input_tokens_seen": 136624940, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48046875, "step": 6368, "time_per_iteration": 2.399765968322754 }, { "auxiliary_loss_clip": 0.01071034, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 1.01450181, "balance_loss_mlp": 1.02185559, "epoch": 0.3829249962422967, "flos": 28542578451840.0, "grad_norm": 1.3072999188014507, "language_loss": 0.68166196, "learning_rate": 2.7194981456368857e-06, "loss": 0.70266342, "num_input_tokens_seen": 136645540, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.4921875, "step": 6369, "time_per_iteration": 2.4642443656921387 }, { "auxiliary_loss_clip": 0.01072015, "auxiliary_loss_mlp": 0.01029282, "balance_loss_clip": 1.01568067, "balance_loss_mlp": 1.02340841, "epoch": 0.38298511949496467, "flos": 21470117472000.0, "grad_norm": 1.6192077323976262, "language_loss": 0.78242338, "learning_rate": 2.719145606021508e-06, "loss": 0.8034364, "num_input_tokens_seen": 136664530, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.484375, "step": 6370, "time_per_iteration": 2.387908935546875 }, { "auxiliary_loss_clip": 0.01071998, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.01696777, "balance_loss_mlp": 1.02367973, "epoch": 0.38304524274763263, "flos": 31678826417280.0, "grad_norm": 3.2585058839607592, "language_loss": 0.6452136, "learning_rate": 2.7187930407416738e-06, "loss": 0.66624951, "num_input_tokens_seen": 136682315, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48242188, "step": 6371, "time_per_iteration": 2.4578778743743896 }, { "auxiliary_loss_clip": 0.01074473, "auxiliary_loss_mlp": 0.01030038, "balance_loss_clip": 1.01361084, "balance_loss_mlp": 1.02350307, "epoch": 0.3831053660003006, "flos": 25075807845120.0, "grad_norm": 1.9348233168684754, "language_loss": 0.72772771, "learning_rate": 2.7184404498099644e-06, "loss": 0.74877286, "num_input_tokens_seen": 136701185, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.5078125, "step": 6372, "time_per_iteration": 2.4492855072021484 }, { "auxiliary_loss_clip": 0.0107258, "auxiliary_loss_mlp": 0.01032563, "balance_loss_clip": 1.01692247, "balance_loss_mlp": 1.02249813, "epoch": 0.38316548925296856, "flos": 23257880017920.0, "grad_norm": 1.7487877720047569, "language_loss": 0.847821, "learning_rate": 2.7180878332389638e-06, "loss": 0.86887246, "num_input_tokens_seen": 136721265, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5, "step": 6373, "time_per_iteration": 2.397027015686035 }, { "auxiliary_loss_clip": 0.01075996, "auxiliary_loss_mlp": 0.01036034, "balance_loss_clip": 1.02008963, "balance_loss_mlp": 1.0246737, "epoch": 0.3832256125056366, "flos": 34422021763200.0, "grad_norm": 2.0572916232565457, "language_loss": 0.74710399, "learning_rate": 2.7177351910412553e-06, "loss": 0.76822436, "num_input_tokens_seen": 136741885, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.51171875, "step": 6374, "time_per_iteration": 2.4963696002960205 }, { "auxiliary_loss_clip": 0.01075414, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.01593471, "balance_loss_mlp": 1.02460217, "epoch": 0.38328573575830455, "flos": 21761677169280.0, "grad_norm": 2.384889744945214, "language_loss": 0.76147139, "learning_rate": 2.717382523229424e-06, "loss": 0.78253639, "num_input_tokens_seen": 136760905, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 6375, "time_per_iteration": 2.3789172172546387 }, { "auxiliary_loss_clip": 0.01071459, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.01647997, "balance_loss_mlp": 1.0228107, "epoch": 0.3833458590109725, "flos": 17379169845120.0, "grad_norm": 2.4970649556509303, "language_loss": 0.72878611, "learning_rate": 2.7170298298160558e-06, "loss": 0.74980718, "num_input_tokens_seen": 136777240, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.48632812, "step": 6376, "time_per_iteration": 3.754441261291504 }, { "auxiliary_loss_clip": 0.01068327, "auxiliary_loss_mlp": 0.01027071, "balance_loss_clip": 1.01203871, "balance_loss_mlp": 1.02115405, "epoch": 0.3834059822636405, "flos": 29423262297600.0, "grad_norm": 1.6191556308667414, "language_loss": 0.67835015, "learning_rate": 2.7166771108137373e-06, "loss": 0.6993041, "num_input_tokens_seen": 136801040, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.47265625, "step": 6377, "time_per_iteration": 2.451864242553711 }, { "auxiliary_loss_clip": 0.01072518, "auxiliary_loss_mlp": 0.01032173, "balance_loss_clip": 1.01624131, "balance_loss_mlp": 1.02335477, "epoch": 0.38346610551630844, "flos": 21469663624320.0, "grad_norm": 1.8065877179990457, "language_loss": 0.73104817, "learning_rate": 2.7163243662350574e-06, "loss": 0.75209504, "num_input_tokens_seen": 136819495, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.4921875, "step": 6378, "time_per_iteration": 2.3680737018585205 }, { "auxiliary_loss_clip": 0.01073427, "auxiliary_loss_mlp": 0.01033182, "balance_loss_clip": 1.01871037, "balance_loss_mlp": 1.02270603, "epoch": 0.3835262287689764, "flos": 27560052570240.0, "grad_norm": 1.7622105152330363, "language_loss": 0.69400299, "learning_rate": 2.7159715960926025e-06, "loss": 0.71506906, "num_input_tokens_seen": 136838840, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5078125, "step": 6379, "time_per_iteration": 2.443986415863037 }, { "auxiliary_loss_clip": 0.01070451, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.01378214, "balance_loss_mlp": 1.02287591, "epoch": 0.3835863520216444, "flos": 15522802744320.0, "grad_norm": 1.7018887126676754, "language_loss": 0.83296108, "learning_rate": 2.715618800398963e-06, "loss": 0.85395229, "num_input_tokens_seen": 136854425, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.47460938, "step": 6380, "time_per_iteration": 3.7976813316345215 }, { "auxiliary_loss_clip": 0.01070669, "auxiliary_loss_mlp": 0.01024584, "balance_loss_clip": 1.01063609, "balance_loss_mlp": 1.02305818, "epoch": 0.38364647527431234, "flos": 21903948426240.0, "grad_norm": 1.3731764596619855, "language_loss": 0.81131947, "learning_rate": 2.7152659791667296e-06, "loss": 0.83227193, "num_input_tokens_seen": 136874355, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.4765625, "step": 6381, "time_per_iteration": 2.413869857788086 }, { "auxiliary_loss_clip": 0.01011962, "auxiliary_loss_mlp": 0.01016199, "balance_loss_clip": 1.01488221, "balance_loss_mlp": 1.00208092, "epoch": 0.3837065985269803, "flos": 65531902798080.0, "grad_norm": 0.7940873470552489, "language_loss": 0.60454381, "learning_rate": 2.7149131324084925e-06, "loss": 0.62482536, "num_input_tokens_seen": 136937475, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.09863281, "step": 6382, "time_per_iteration": 4.399012565612793 }, { "auxiliary_loss_clip": 0.01075227, "auxiliary_loss_mlp": 0.01027514, "balance_loss_clip": 1.01227927, "balance_loss_mlp": 1.0229882, "epoch": 0.38376672177964827, "flos": 28255347763200.0, "grad_norm": 4.7659232854408815, "language_loss": 0.66728902, "learning_rate": 2.714560260136846e-06, "loss": 0.68831635, "num_input_tokens_seen": 136955805, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5234375, "step": 6383, "time_per_iteration": 2.427001953125 }, { "auxiliary_loss_clip": 0.01073185, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.01498294, "balance_loss_mlp": 1.02327538, "epoch": 0.38382684503231623, "flos": 20630316695040.0, "grad_norm": 1.6175670644657814, "language_loss": 0.74437582, "learning_rate": 2.714207362364381e-06, "loss": 0.76540101, "num_input_tokens_seen": 136975240, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.5, "step": 6384, "time_per_iteration": 2.3825740814208984 }, { "auxiliary_loss_clip": 0.01071315, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.01310515, "balance_loss_mlp": 1.02346826, "epoch": 0.3838869682849842, "flos": 19604917797120.0, "grad_norm": 1.5585058862983203, "language_loss": 0.76371676, "learning_rate": 2.7138544391036925e-06, "loss": 0.78470719, "num_input_tokens_seen": 136994985, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.47851562, "step": 6385, "time_per_iteration": 2.400175094604492 }, { "auxiliary_loss_clip": 0.01011066, "auxiliary_loss_mlp": 0.01001261, "balance_loss_clip": 0.99994946, "balance_loss_mlp": 1.00165391, "epoch": 0.38394709153765216, "flos": 56553428897280.0, "grad_norm": 0.9083990369851577, "language_loss": 0.67076528, "learning_rate": 2.7135014903673748e-06, "loss": 0.69088852, "num_input_tokens_seen": 137046290, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.09423828, "step": 6386, "time_per_iteration": 4.288057327270508 }, { "auxiliary_loss_clip": 0.01071762, "auxiliary_loss_mlp": 0.01027539, "balance_loss_clip": 1.01443815, "balance_loss_mlp": 1.02398896, "epoch": 0.3840072147903202, "flos": 15887819675520.0, "grad_norm": 1.6717493867185969, "language_loss": 0.72432387, "learning_rate": 2.713148516168025e-06, "loss": 0.74531686, "num_input_tokens_seen": 137064725, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.47851562, "step": 6387, "time_per_iteration": 2.398987293243408 }, { "auxiliary_loss_clip": 0.01074301, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.0169003, "balance_loss_mlp": 1.02620327, "epoch": 0.38406733804298815, "flos": 28216838666880.0, "grad_norm": 1.5624960160758032, "language_loss": 0.81052637, "learning_rate": 2.712795516518239e-06, "loss": 0.83158249, "num_input_tokens_seen": 137086030, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48046875, "step": 6388, "time_per_iteration": 2.438119411468506 }, { "auxiliary_loss_clip": 0.01069137, "auxiliary_loss_mlp": 0.01025842, "balance_loss_clip": 1.01230025, "balance_loss_mlp": 1.02218246, "epoch": 0.3841274612956561, "flos": 18222601403520.0, "grad_norm": 1.9034531866687259, "language_loss": 0.76234365, "learning_rate": 2.7124424914306143e-06, "loss": 0.78329349, "num_input_tokens_seen": 137105400, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.47070312, "step": 6389, "time_per_iteration": 2.367206573486328 }, { "auxiliary_loss_clip": 0.01075656, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.02145958, "balance_loss_mlp": 1.02457583, "epoch": 0.3841875845483241, "flos": 19791842549760.0, "grad_norm": 3.0480319906333055, "language_loss": 0.76977909, "learning_rate": 2.71208944091775e-06, "loss": 0.79090583, "num_input_tokens_seen": 137124985, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.51171875, "step": 6390, "time_per_iteration": 2.382983446121216 }, { "auxiliary_loss_clip": 0.01073509, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.02360523, "balance_loss_mlp": 1.02343667, "epoch": 0.38424770780099204, "flos": 29897522472960.0, "grad_norm": 1.542899672739723, "language_loss": 0.69253361, "learning_rate": 2.7117363649922453e-06, "loss": 0.71366596, "num_input_tokens_seen": 137146745, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.5, "step": 6391, "time_per_iteration": 2.4826838970184326 }, { "auxiliary_loss_clip": 0.01072126, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.01970649, "balance_loss_mlp": 1.0217371, "epoch": 0.38430783105366, "flos": 20812668059520.0, "grad_norm": 1.7190900938160967, "language_loss": 0.84027886, "learning_rate": 2.7113832636667e-06, "loss": 0.8613553, "num_input_tokens_seen": 137163195, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.50390625, "step": 6392, "time_per_iteration": 2.3855299949645996 }, { "auxiliary_loss_clip": 0.01071006, "auxiliary_loss_mlp": 0.01033321, "balance_loss_clip": 1.01841426, "balance_loss_mlp": 1.02166486, "epoch": 0.384367954306328, "flos": 10997814695040.0, "grad_norm": 2.2771003122968074, "language_loss": 0.61486125, "learning_rate": 2.7110301369537168e-06, "loss": 0.63590455, "num_input_tokens_seen": 137179330, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.49414062, "step": 6393, "time_per_iteration": 2.3465282917022705 }, { "auxiliary_loss_clip": 0.01074606, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.01738417, "balance_loss_mlp": 1.02258933, "epoch": 0.38442807755899594, "flos": 25336853147520.0, "grad_norm": 2.2141847589341936, "language_loss": 0.71160275, "learning_rate": 2.7106769848658965e-06, "loss": 0.73267913, "num_input_tokens_seen": 137198655, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.51953125, "step": 6394, "time_per_iteration": 2.417149066925049 }, { "auxiliary_loss_clip": 0.010775, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.01730824, "balance_loss_mlp": 1.02492261, "epoch": 0.3844882008116639, "flos": 21068686126080.0, "grad_norm": 1.971583195942539, "language_loss": 0.80974996, "learning_rate": 2.710323807415843e-06, "loss": 0.83086514, "num_input_tokens_seen": 137217120, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.52734375, "step": 6395, "time_per_iteration": 2.3835811614990234 }, { "auxiliary_loss_clip": 0.01073392, "auxiliary_loss_mlp": 0.0102853, "balance_loss_clip": 1.01352715, "balance_loss_mlp": 1.02477539, "epoch": 0.38454832406433187, "flos": 17962393973760.0, "grad_norm": 1.8966165835568778, "language_loss": 0.70925415, "learning_rate": 2.7099706046161593e-06, "loss": 0.73027331, "num_input_tokens_seen": 137234410, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.48632812, "step": 6396, "time_per_iteration": 2.3668510913848877 }, { "auxiliary_loss_clip": 0.01011183, "auxiliary_loss_mlp": 0.01005274, "balance_loss_clip": 1.00379539, "balance_loss_mlp": 1.00181806, "epoch": 0.38460844731699984, "flos": 67921392493440.0, "grad_norm": 1.0275815071984948, "language_loss": 0.5960207, "learning_rate": 2.7096173764794514e-06, "loss": 0.61618525, "num_input_tokens_seen": 137294940, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.09375, "step": 6397, "time_per_iteration": 3.0761427879333496 }, { "auxiliary_loss_clip": 0.0107341, "auxiliary_loss_mlp": 0.01027271, "balance_loss_clip": 1.01259041, "balance_loss_mlp": 1.02422559, "epoch": 0.3846685705696678, "flos": 25847876851200.0, "grad_norm": 1.8712059024006888, "language_loss": 0.84767878, "learning_rate": 2.7092641230183243e-06, "loss": 0.86868554, "num_input_tokens_seen": 137315035, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.4921875, "step": 6398, "time_per_iteration": 2.4344027042388916 }, { "auxiliary_loss_clip": 0.01071551, "auxiliary_loss_mlp": 0.01024638, "balance_loss_clip": 1.01069093, "balance_loss_mlp": 1.02309728, "epoch": 0.38472869382233577, "flos": 16289251021440.0, "grad_norm": 2.3018006319989928, "language_loss": 0.79428566, "learning_rate": 2.7089108442453854e-06, "loss": 0.81524754, "num_input_tokens_seen": 137333155, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.484375, "step": 6399, "time_per_iteration": 2.3623061180114746 }, { "auxiliary_loss_clip": 0.01073319, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.01175857, "balance_loss_mlp": 1.02303982, "epoch": 0.38478881707500373, "flos": 19352146487040.0, "grad_norm": 1.7496453168028605, "language_loss": 0.66749483, "learning_rate": 2.7085575401732423e-06, "loss": 0.68850756, "num_input_tokens_seen": 137351515, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.50390625, "step": 6400, "time_per_iteration": 2.3797361850738525 }, { "auxiliary_loss_clip": 0.01074579, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.01956546, "balance_loss_mlp": 1.02425206, "epoch": 0.38484894032767175, "flos": 24859765152000.0, "grad_norm": 1.8224421794010612, "language_loss": 0.73357236, "learning_rate": 2.708204210814503e-06, "loss": 0.75466013, "num_input_tokens_seen": 137371255, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.50390625, "step": 6401, "time_per_iteration": 2.4101078510284424 }, { "auxiliary_loss_clip": 0.01072633, "auxiliary_loss_mlp": 0.01033675, "balance_loss_clip": 1.01918507, "balance_loss_mlp": 1.02406967, "epoch": 0.3849090635803397, "flos": 14500929893760.0, "grad_norm": 1.9233321395908127, "language_loss": 0.7154358, "learning_rate": 2.707850856181777e-06, "loss": 0.73649889, "num_input_tokens_seen": 137388980, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48632812, "step": 6402, "time_per_iteration": 2.362643003463745 }, { "auxiliary_loss_clip": 0.01069168, "auxiliary_loss_mlp": 0.01026851, "balance_loss_clip": 1.01277208, "balance_loss_mlp": 1.02213025, "epoch": 0.3849691868330077, "flos": 18514859328000.0, "grad_norm": 2.4061362794420322, "language_loss": 0.83045423, "learning_rate": 2.707497476287675e-06, "loss": 0.85141438, "num_input_tokens_seen": 137406885, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.47070312, "step": 6403, "time_per_iteration": 2.3753793239593506 }, { "auxiliary_loss_clip": 0.01071288, "auxiliary_loss_mlp": 0.01029477, "balance_loss_clip": 1.01444423, "balance_loss_mlp": 1.02305806, "epoch": 0.38502931008567565, "flos": 21615321283200.0, "grad_norm": 1.9041392491294706, "language_loss": 0.82957143, "learning_rate": 2.7071440711448077e-06, "loss": 0.85057902, "num_input_tokens_seen": 137425535, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.48242188, "step": 6404, "time_per_iteration": 2.403550148010254 }, { "auxiliary_loss_clip": 0.01074332, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 1.0150671, "balance_loss_mlp": 1.02420688, "epoch": 0.3850894333383436, "flos": 25414045896960.0, "grad_norm": 1.4784060048485066, "language_loss": 0.69566846, "learning_rate": 2.7067906407657877e-06, "loss": 0.71670413, "num_input_tokens_seen": 137447700, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.5, "step": 6405, "time_per_iteration": 2.500673770904541 }, { "auxiliary_loss_clip": 0.01069248, "auxiliary_loss_mlp": 0.01028059, "balance_loss_clip": 1.01427257, "balance_loss_mlp": 1.02266669, "epoch": 0.3851495565910116, "flos": 20226895401600.0, "grad_norm": 1.922006172058228, "language_loss": 0.78940684, "learning_rate": 2.706437185163228e-06, "loss": 0.81037986, "num_input_tokens_seen": 137462245, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.46484375, "step": 6406, "time_per_iteration": 2.3813631534576416 }, { "auxiliary_loss_clip": 0.01075237, "auxiliary_loss_mlp": 0.01030729, "balance_loss_clip": 1.01598334, "balance_loss_mlp": 1.02550876, "epoch": 0.38520967984367954, "flos": 16507528041600.0, "grad_norm": 2.819893938423056, "language_loss": 0.84415394, "learning_rate": 2.7060837043497416e-06, "loss": 0.86521363, "num_input_tokens_seen": 137476455, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.49609375, "step": 6407, "time_per_iteration": 2.3337297439575195 }, { "auxiliary_loss_clip": 0.01012659, "auxiliary_loss_mlp": 0.01004048, "balance_loss_clip": 1.00264144, "balance_loss_mlp": 1.00296116, "epoch": 0.3852698030963475, "flos": 61310553776640.0, "grad_norm": 0.8243768979758257, "language_loss": 0.64843804, "learning_rate": 2.7057301983379452e-06, "loss": 0.66860509, "num_input_tokens_seen": 137539845, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.09667969, "step": 6408, "time_per_iteration": 3.0884344577789307 }, { "auxiliary_loss_clip": 0.01072947, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.01689124, "balance_loss_mlp": 1.02335691, "epoch": 0.3853299263490155, "flos": 22891920480000.0, "grad_norm": 2.2340149704786887, "language_loss": 0.73693776, "learning_rate": 2.705376667140452e-06, "loss": 0.75798643, "num_input_tokens_seen": 137559880, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49609375, "step": 6409, "time_per_iteration": 2.39340877532959 }, { "auxiliary_loss_clip": 0.01078745, "auxiliary_loss_mlp": 0.01043236, "balance_loss_clip": 1.02677345, "balance_loss_mlp": 1.02513075, "epoch": 0.38539004960168344, "flos": 20046464161920.0, "grad_norm": 2.003972399415588, "language_loss": 0.70190561, "learning_rate": 2.705023110769881e-06, "loss": 0.7231254, "num_input_tokens_seen": 137578225, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.53515625, "step": 6410, "time_per_iteration": 2.3725991249084473 }, { "auxiliary_loss_clip": 0.01010867, "auxiliary_loss_mlp": 0.01001821, "balance_loss_clip": 1.00045645, "balance_loss_mlp": 1.00138807, "epoch": 0.3854501728543514, "flos": 68726978271360.0, "grad_norm": 0.6712661248004854, "language_loss": 0.60383004, "learning_rate": 2.7046695292388485e-06, "loss": 0.62395692, "num_input_tokens_seen": 137645770, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.09472656, "step": 6411, "time_per_iteration": 3.1204044818878174 }, { "auxiliary_loss_clip": 0.01070236, "auxiliary_loss_mlp": 0.01027987, "balance_loss_clip": 1.01416492, "balance_loss_mlp": 1.02227807, "epoch": 0.38551029610701937, "flos": 20483995720320.0, "grad_norm": 1.7445958039108738, "language_loss": 0.77560854, "learning_rate": 2.7043159225599727e-06, "loss": 0.79659081, "num_input_tokens_seen": 137664090, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48046875, "step": 6412, "time_per_iteration": 2.3860766887664795 }, { "auxiliary_loss_clip": 0.01074559, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.01528811, "balance_loss_mlp": 1.02346826, "epoch": 0.38557041935968733, "flos": 23470815600000.0, "grad_norm": 2.3715598386389916, "language_loss": 0.77797347, "learning_rate": 2.703962290745874e-06, "loss": 0.79903489, "num_input_tokens_seen": 137683190, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.51171875, "step": 6413, "time_per_iteration": 2.4085443019866943 }, { "auxiliary_loss_clip": 0.01010277, "auxiliary_loss_mlp": 0.01000332, "balance_loss_clip": 0.99906224, "balance_loss_mlp": 1.00077569, "epoch": 0.38563054261235535, "flos": 63963639169920.0, "grad_norm": 0.8244576745760341, "language_loss": 0.61250973, "learning_rate": 2.703608633809171e-06, "loss": 0.6326158, "num_input_tokens_seen": 137737315, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.09472656, "step": 6414, "time_per_iteration": 2.9032487869262695 }, { "auxiliary_loss_clip": 0.01075195, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.01401854, "balance_loss_mlp": 1.02491593, "epoch": 0.3856906658650233, "flos": 23986657071360.0, "grad_norm": 2.068949848606914, "language_loss": 0.7726903, "learning_rate": 2.7032549517624865e-06, "loss": 0.79373378, "num_input_tokens_seen": 137753535, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.50390625, "step": 6415, "time_per_iteration": 2.40846586227417 }, { "auxiliary_loss_clip": 0.01066083, "auxiliary_loss_mlp": 0.01023546, "balance_loss_clip": 1.01011753, "balance_loss_mlp": 1.02223635, "epoch": 0.3857507891176913, "flos": 25006330506240.0, "grad_norm": 1.6635642160114679, "language_loss": 0.79608589, "learning_rate": 2.702901244618442e-06, "loss": 0.81698215, "num_input_tokens_seen": 137773405, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.43945312, "step": 6416, "time_per_iteration": 3.8112735748291016 }, { "auxiliary_loss_clip": 0.01071455, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.01581168, "balance_loss_mlp": 1.02252793, "epoch": 0.38581091237035925, "flos": 21535894206720.0, "grad_norm": 1.768979154164975, "language_loss": 0.7882784, "learning_rate": 2.7025475123896597e-06, "loss": 0.80928624, "num_input_tokens_seen": 137790810, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.48828125, "step": 6417, "time_per_iteration": 2.4051363468170166 }, { "auxiliary_loss_clip": 0.01070115, "auxiliary_loss_mlp": 0.01027896, "balance_loss_clip": 1.01463938, "balance_loss_mlp": 1.0216831, "epoch": 0.3858710356230272, "flos": 17382940272000.0, "grad_norm": 2.0697801577287827, "language_loss": 0.79743356, "learning_rate": 2.702193755088764e-06, "loss": 0.81841362, "num_input_tokens_seen": 137810265, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.48242188, "step": 6418, "time_per_iteration": 2.3811984062194824 }, { "auxiliary_loss_clip": 0.01069308, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.01234293, "balance_loss_mlp": 1.02141714, "epoch": 0.3859311588756952, "flos": 20338547529600.0, "grad_norm": 1.8091425112120791, "language_loss": 0.79684544, "learning_rate": 2.701839972728379e-06, "loss": 0.8177923, "num_input_tokens_seen": 137828580, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.48046875, "step": 6419, "time_per_iteration": 2.405202627182007 }, { "auxiliary_loss_clip": 0.01067784, "auxiliary_loss_mlp": 0.01028805, "balance_loss_clip": 1.0134511, "balance_loss_mlp": 1.02183938, "epoch": 0.38599128212836314, "flos": 26320007433600.0, "grad_norm": 2.031902158674048, "language_loss": 0.67538393, "learning_rate": 2.7014861653211314e-06, "loss": 0.6963498, "num_input_tokens_seen": 137846145, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.45898438, "step": 6420, "time_per_iteration": 3.852813482284546 }, { "auxiliary_loss_clip": 0.01070134, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.01369035, "balance_loss_mlp": 1.02428031, "epoch": 0.3860514053810311, "flos": 13552968124800.0, "grad_norm": 1.9556236432527907, "language_loss": 0.81696451, "learning_rate": 2.701132332879646e-06, "loss": 0.83793354, "num_input_tokens_seen": 137863705, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.45898438, "step": 6421, "time_per_iteration": 3.7698209285736084 }, { "auxiliary_loss_clip": 0.0107089, "auxiliary_loss_mlp": 0.01025447, "balance_loss_clip": 1.01142216, "balance_loss_mlp": 1.02289939, "epoch": 0.3861115286336991, "flos": 20953368305280.0, "grad_norm": 2.181946551620686, "language_loss": 0.71599036, "learning_rate": 2.700778475416552e-06, "loss": 0.73695374, "num_input_tokens_seen": 137880285, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.47851562, "step": 6422, "time_per_iteration": 2.382434129714966 }, { "auxiliary_loss_clip": 0.01069288, "auxiliary_loss_mlp": 0.01024515, "balance_loss_clip": 1.01148498, "balance_loss_mlp": 1.02364755, "epoch": 0.38617165188636704, "flos": 16361765648640.0, "grad_norm": 1.5707566251050673, "language_loss": 0.66728193, "learning_rate": 2.7004245929444776e-06, "loss": 0.6882199, "num_input_tokens_seen": 137898335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.45703125, "step": 6423, "time_per_iteration": 2.401672124862671 }, { "auxiliary_loss_clip": 0.01072334, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.01381183, "balance_loss_mlp": 1.02393031, "epoch": 0.386231775139035, "flos": 34785851708160.0, "grad_norm": 1.7924744273413715, "language_loss": 0.68787992, "learning_rate": 2.7000706854760504e-06, "loss": 0.70888919, "num_input_tokens_seen": 137918605, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.48242188, "step": 6424, "time_per_iteration": 2.5371901988983154 }, { "auxiliary_loss_clip": 0.01068759, "auxiliary_loss_mlp": 0.01031866, "balance_loss_clip": 1.01734054, "balance_loss_mlp": 1.02193975, "epoch": 0.38629189839170297, "flos": 21725088197760.0, "grad_norm": 1.3597977567548862, "language_loss": 0.72149193, "learning_rate": 2.699716753023901e-06, "loss": 0.74249816, "num_input_tokens_seen": 137938245, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.46875, "step": 6425, "time_per_iteration": 3.8290693759918213 }, { "auxiliary_loss_clip": 0.01073253, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.01898479, "balance_loss_mlp": 1.02292788, "epoch": 0.38635202164437094, "flos": 27922520972160.0, "grad_norm": 1.8597590825559922, "language_loss": 0.81127745, "learning_rate": 2.69936279560066e-06, "loss": 0.83234417, "num_input_tokens_seen": 137956770, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.50390625, "step": 6426, "time_per_iteration": 2.444472312927246 }, { "auxiliary_loss_clip": 0.01071708, "auxiliary_loss_mlp": 0.01033324, "balance_loss_clip": 1.01869082, "balance_loss_mlp": 1.02344429, "epoch": 0.38641214489703896, "flos": 23585505016320.0, "grad_norm": 1.9859570078916087, "language_loss": 0.7460295, "learning_rate": 2.699008813218961e-06, "loss": 0.76707983, "num_input_tokens_seen": 137977040, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48242188, "step": 6427, "time_per_iteration": 2.413057327270508 }, { "auxiliary_loss_clip": 0.01069843, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.01843154, "balance_loss_mlp": 1.02340186, "epoch": 0.3864722681497069, "flos": 12640408341120.0, "grad_norm": 2.1221194787307676, "language_loss": 0.70528924, "learning_rate": 2.698654805891435e-06, "loss": 0.72631478, "num_input_tokens_seen": 137993545, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.46484375, "step": 6428, "time_per_iteration": 2.3660902976989746 }, { "auxiliary_loss_clip": 0.01072193, "auxiliary_loss_mlp": 0.01032739, "balance_loss_clip": 1.01941705, "balance_loss_mlp": 1.02372837, "epoch": 0.3865323914023749, "flos": 17598075269760.0, "grad_norm": 2.2501368626028895, "language_loss": 0.84230453, "learning_rate": 2.6983007736307158e-06, "loss": 0.86335385, "num_input_tokens_seen": 138010140, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.48632812, "step": 6429, "time_per_iteration": 2.353111743927002 }, { "auxiliary_loss_clip": 0.01072221, "auxiliary_loss_mlp": 0.01027437, "balance_loss_clip": 1.01343548, "balance_loss_mlp": 1.02398658, "epoch": 0.38659251465504285, "flos": 18477956154240.0, "grad_norm": 2.1164698228429746, "language_loss": 0.81232369, "learning_rate": 2.6979467164494387e-06, "loss": 0.8333202, "num_input_tokens_seen": 138028880, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.48046875, "step": 6430, "time_per_iteration": 2.406045913696289 }, { "auxiliary_loss_clip": 0.01012101, "auxiliary_loss_mlp": 0.01012191, "balance_loss_clip": 1.01060581, "balance_loss_mlp": 1.00226521, "epoch": 0.3866526379077108, "flos": 64162259228160.0, "grad_norm": 0.7232263840275747, "language_loss": 0.58855486, "learning_rate": 2.697592634360238e-06, "loss": 0.60879779, "num_input_tokens_seen": 138098090, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.09814453, "step": 6431, "time_per_iteration": 3.0711395740509033 }, { "auxiliary_loss_clip": 0.01072427, "auxiliary_loss_mlp": 0.01028632, "balance_loss_clip": 1.01259255, "balance_loss_mlp": 1.02267051, "epoch": 0.3867127611603788, "flos": 14387532197760.0, "grad_norm": 2.257020157116884, "language_loss": 0.79567808, "learning_rate": 2.6972385273757513e-06, "loss": 0.81668866, "num_input_tokens_seen": 138114735, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.49804688, "step": 6432, "time_per_iteration": 2.3641786575317383 }, { "auxiliary_loss_clip": 0.0107401, "auxiliary_loss_mlp": 0.01032301, "balance_loss_clip": 1.01705456, "balance_loss_mlp": 1.02310538, "epoch": 0.38677288441304675, "flos": 20009735544960.0, "grad_norm": 2.141807986625703, "language_loss": 0.80594218, "learning_rate": 2.6968843955086155e-06, "loss": 0.82700533, "num_input_tokens_seen": 138130480, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5078125, "step": 6433, "time_per_iteration": 2.3678066730499268 }, { "auxiliary_loss_clip": 0.01073231, "auxiliary_loss_mlp": 0.01029923, "balance_loss_clip": 1.01449776, "balance_loss_mlp": 1.02351093, "epoch": 0.3868330076657147, "flos": 22235797699200.0, "grad_norm": 1.5544772490871022, "language_loss": 0.70720983, "learning_rate": 2.696530238771467e-06, "loss": 0.72824132, "num_input_tokens_seen": 138150640, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.49609375, "step": 6434, "time_per_iteration": 2.4034829139709473 }, { "auxiliary_loss_clip": 0.01074369, "auxiliary_loss_mlp": 0.01032387, "balance_loss_clip": 1.01748013, "balance_loss_mlp": 1.02348089, "epoch": 0.3868931309183827, "flos": 16726503288960.0, "grad_norm": 1.6909865285431163, "language_loss": 0.77458197, "learning_rate": 2.696176057176947e-06, "loss": 0.79564953, "num_input_tokens_seen": 138169700, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 6435, "time_per_iteration": 2.358459234237671 }, { "auxiliary_loss_clip": 0.01071084, "auxiliary_loss_mlp": 0.01033566, "balance_loss_clip": 1.01889133, "balance_loss_mlp": 1.02302909, "epoch": 0.38695325417105064, "flos": 22673608548480.0, "grad_norm": 1.6545892669921791, "language_loss": 0.79653662, "learning_rate": 2.6958218507376936e-06, "loss": 0.81758314, "num_input_tokens_seen": 138185835, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48046875, "step": 6436, "time_per_iteration": 2.393425226211548 }, { "auxiliary_loss_clip": 0.01069698, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 1.01666141, "balance_loss_mlp": 1.02266824, "epoch": 0.3870133774237186, "flos": 23110930638720.0, "grad_norm": 1.6240719441988032, "language_loss": 0.76536834, "learning_rate": 2.6954676194663486e-06, "loss": 0.78636229, "num_input_tokens_seen": 138204080, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.47070312, "step": 6437, "time_per_iteration": 2.3904120922088623 }, { "auxiliary_loss_clip": 0.01070421, "auxiliary_loss_mlp": 0.01035005, "balance_loss_clip": 1.02124822, "balance_loss_mlp": 1.02351093, "epoch": 0.3870735006763866, "flos": 17674744348800.0, "grad_norm": 2.1502736092777988, "language_loss": 0.81843197, "learning_rate": 2.6951133633755538e-06, "loss": 0.83948618, "num_input_tokens_seen": 138220710, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.46875, "step": 6438, "time_per_iteration": 2.3632795810699463 }, { "auxiliary_loss_clip": 0.01072497, "auxiliary_loss_mlp": 0.01032213, "balance_loss_clip": 1.01767588, "balance_loss_mlp": 1.02363777, "epoch": 0.38713362392905454, "flos": 23294643546240.0, "grad_norm": 1.737557186528857, "language_loss": 0.75216937, "learning_rate": 2.6947590824779502e-06, "loss": 0.77321649, "num_input_tokens_seen": 138241720, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.48828125, "step": 6439, "time_per_iteration": 2.3971452713012695 }, { "auxiliary_loss_clip": 0.01068407, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.01677775, "balance_loss_mlp": 1.02272558, "epoch": 0.38719374718172256, "flos": 21030177029760.0, "grad_norm": 1.4625146126776782, "language_loss": 0.73726898, "learning_rate": 2.694404776786182e-06, "loss": 0.75825536, "num_input_tokens_seen": 138261885, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.45703125, "step": 6440, "time_per_iteration": 2.4049558639526367 }, { "auxiliary_loss_clip": 0.01072184, "auxiliary_loss_mlp": 0.01031043, "balance_loss_clip": 1.01621962, "balance_loss_mlp": 1.02272618, "epoch": 0.3872538704343905, "flos": 19608758046720.0, "grad_norm": 1.9449260359710494, "language_loss": 0.82023013, "learning_rate": 2.6940504463128933e-06, "loss": 0.8412624, "num_input_tokens_seen": 138280255, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49414062, "step": 6441, "time_per_iteration": 2.3869669437408447 }, { "auxiliary_loss_clip": 0.01072746, "auxiliary_loss_mlp": 0.01035742, "balance_loss_clip": 1.02196097, "balance_loss_mlp": 1.02449954, "epoch": 0.3873139936870585, "flos": 17529086689920.0, "grad_norm": 2.217150395137991, "language_loss": 0.81612539, "learning_rate": 2.6936960910707307e-06, "loss": 0.8372103, "num_input_tokens_seen": 138296675, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48242188, "step": 6442, "time_per_iteration": 2.3692736625671387 }, { "auxiliary_loss_clip": 0.01070033, "auxiliary_loss_mlp": 0.01026524, "balance_loss_clip": 1.01205254, "balance_loss_mlp": 1.02191114, "epoch": 0.38737411693972645, "flos": 17785558604160.0, "grad_norm": 1.5835994959432471, "language_loss": 0.83717436, "learning_rate": 2.693341711072338e-06, "loss": 0.85813987, "num_input_tokens_seen": 138314985, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48046875, "step": 6443, "time_per_iteration": 2.3825783729553223 }, { "auxiliary_loss_clip": 0.01010682, "auxiliary_loss_mlp": 0.01003713, "balance_loss_clip": 1.00215173, "balance_loss_mlp": 1.00128007, "epoch": 0.3874342401923944, "flos": 58301984119680.0, "grad_norm": 0.7545362952080468, "language_loss": 0.50255579, "learning_rate": 2.6929873063303634e-06, "loss": 0.52269971, "num_input_tokens_seen": 138373275, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.09375, "step": 6444, "time_per_iteration": 3.0356690883636475 }, { "auxiliary_loss_clip": 0.01068063, "auxiliary_loss_mlp": 0.01026805, "balance_loss_clip": 1.01370442, "balance_loss_mlp": 1.02287197, "epoch": 0.3874943634450624, "flos": 17710984206720.0, "grad_norm": 1.6195469748190225, "language_loss": 0.78630078, "learning_rate": 2.6926328768574545e-06, "loss": 0.80724943, "num_input_tokens_seen": 138391145, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.45117188, "step": 6445, "time_per_iteration": 2.3599650859832764 }, { "auxiliary_loss_clip": 0.01069329, "auxiliary_loss_mlp": 0.01025607, "balance_loss_clip": 1.01224995, "balance_loss_mlp": 1.02317977, "epoch": 0.38755448669773035, "flos": 19243845849600.0, "grad_norm": 1.9021278803043031, "language_loss": 0.80816722, "learning_rate": 2.6922784226662595e-06, "loss": 0.82911658, "num_input_tokens_seen": 138409875, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.4609375, "step": 6446, "time_per_iteration": 2.3811867237091064 }, { "auxiliary_loss_clip": 0.01069717, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.01606715, "balance_loss_mlp": 1.02237332, "epoch": 0.3876146099503983, "flos": 20593238964480.0, "grad_norm": 1.8365217279326018, "language_loss": 0.77433693, "learning_rate": 2.6919239437694288e-06, "loss": 0.79533482, "num_input_tokens_seen": 138428965, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.47265625, "step": 6447, "time_per_iteration": 2.3864388465881348 }, { "auxiliary_loss_clip": 0.01070298, "auxiliary_loss_mlp": 0.01027419, "balance_loss_clip": 1.01415086, "balance_loss_mlp": 1.02335882, "epoch": 0.3876747332030663, "flos": 19280120618880.0, "grad_norm": 1.5373124043863238, "language_loss": 0.7613622, "learning_rate": 2.691569440179612e-06, "loss": 0.78233933, "num_input_tokens_seen": 138448090, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.46875, "step": 6448, "time_per_iteration": 2.3873212337493896 }, { "auxiliary_loss_clip": 0.01067967, "auxiliary_loss_mlp": 0.01029386, "balance_loss_clip": 1.0157125, "balance_loss_mlp": 1.0216198, "epoch": 0.38773485645573424, "flos": 18945094412160.0, "grad_norm": 1.6357830520393237, "language_loss": 0.75671995, "learning_rate": 2.691214911909461e-06, "loss": 0.77769339, "num_input_tokens_seen": 138466105, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.46289062, "step": 6449, "time_per_iteration": 2.3578057289123535 }, { "auxiliary_loss_clip": 0.01070564, "auxiliary_loss_mlp": 0.0103165, "balance_loss_clip": 1.01674914, "balance_loss_mlp": 1.0209111, "epoch": 0.3877949797084022, "flos": 23070361772160.0, "grad_norm": 1.6775319351474467, "language_loss": 0.78557169, "learning_rate": 2.690860358971628e-06, "loss": 0.80659378, "num_input_tokens_seen": 138485160, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49609375, "step": 6450, "time_per_iteration": 2.3988382816314697 }, { "auxiliary_loss_clip": 0.01074259, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.01351023, "balance_loss_mlp": 1.02337241, "epoch": 0.3878551029610702, "flos": 29094275756160.0, "grad_norm": 2.2474928407061308, "language_loss": 0.7726, "learning_rate": 2.690505781378766e-06, "loss": 0.79362583, "num_input_tokens_seen": 138504135, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5078125, "step": 6451, "time_per_iteration": 2.4257657527923584 }, { "auxiliary_loss_clip": 0.01067293, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 1.01206207, "balance_loss_mlp": 1.02180934, "epoch": 0.38791522621373814, "flos": 20995333626240.0, "grad_norm": 2.1197352035668127, "language_loss": 0.76294684, "learning_rate": 2.6901511791435286e-06, "loss": 0.78387499, "num_input_tokens_seen": 138523955, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.45507812, "step": 6452, "time_per_iteration": 2.406665086746216 }, { "auxiliary_loss_clip": 0.0107, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.01923287, "balance_loss_mlp": 1.02287245, "epoch": 0.3879753494664061, "flos": 15485934481920.0, "grad_norm": 1.7227356610543463, "language_loss": 0.79668105, "learning_rate": 2.689796552278571e-06, "loss": 0.81770849, "num_input_tokens_seen": 138541655, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.47070312, "step": 6453, "time_per_iteration": 2.3568108081817627 }, { "auxiliary_loss_clip": 0.01075601, "auxiliary_loss_mlp": 0.01032157, "balance_loss_clip": 1.0161593, "balance_loss_mlp": 1.02417755, "epoch": 0.3880354727190741, "flos": 22052887752960.0, "grad_norm": 1.6979432927561684, "language_loss": 0.7157777, "learning_rate": 2.689441900796549e-06, "loss": 0.73685527, "num_input_tokens_seen": 138560860, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.515625, "step": 6454, "time_per_iteration": 2.392834186553955 }, { "auxiliary_loss_clip": 0.01073756, "auxiliary_loss_mlp": 0.01027454, "balance_loss_clip": 1.01199269, "balance_loss_mlp": 1.02341008, "epoch": 0.3880955959717421, "flos": 20339245756800.0, "grad_norm": 2.4324141555355134, "language_loss": 0.77854466, "learning_rate": 2.689087224710119e-06, "loss": 0.79955673, "num_input_tokens_seen": 138580200, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.50390625, "step": 6455, "time_per_iteration": 3.828409433364868 }, { "auxiliary_loss_clip": 0.01067146, "auxiliary_loss_mlp": 0.01024271, "balance_loss_clip": 1.0107286, "balance_loss_mlp": 1.0212636, "epoch": 0.38815571922441006, "flos": 23074306755840.0, "grad_norm": 1.4008675524529726, "language_loss": 0.75463307, "learning_rate": 2.688732524031938e-06, "loss": 0.77554727, "num_input_tokens_seen": 138598315, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.45898438, "step": 6456, "time_per_iteration": 2.392615556716919 }, { "auxiliary_loss_clip": 0.01073966, "auxiliary_loss_mlp": 0.01031225, "balance_loss_clip": 1.01632977, "balance_loss_mlp": 1.02410126, "epoch": 0.388215842477078, "flos": 20775904531200.0, "grad_norm": 2.467623895924287, "language_loss": 0.59536481, "learning_rate": 2.688377798774665e-06, "loss": 0.61641669, "num_input_tokens_seen": 138615695, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49804688, "step": 6457, "time_per_iteration": 2.3738536834716797 }, { "auxiliary_loss_clip": 0.01073014, "auxiliary_loss_mlp": 0.01030647, "balance_loss_clip": 1.0152992, "balance_loss_mlp": 1.02263951, "epoch": 0.388275965729746, "flos": 20447162369280.0, "grad_norm": 2.0753033357475736, "language_loss": 0.79797566, "learning_rate": 2.688023048950959e-06, "loss": 0.81901228, "num_input_tokens_seen": 138633180, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.50390625, "step": 6458, "time_per_iteration": 2.3875813484191895 }, { "auxiliary_loss_clip": 0.01072008, "auxiliary_loss_mlp": 0.01025145, "balance_loss_clip": 1.01094759, "balance_loss_mlp": 1.0228076, "epoch": 0.38833608898241395, "flos": 27891133793280.0, "grad_norm": 1.8802103691728687, "language_loss": 0.81069863, "learning_rate": 2.6876682745734807e-06, "loss": 0.83167017, "num_input_tokens_seen": 138654785, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.4921875, "step": 6459, "time_per_iteration": 2.4388864040374756 }, { "auxiliary_loss_clip": 0.01071504, "auxiliary_loss_mlp": 0.01026047, "balance_loss_clip": 1.0121057, "balance_loss_mlp": 1.02348924, "epoch": 0.3883962122350819, "flos": 18075442556160.0, "grad_norm": 1.7271235675480758, "language_loss": 0.61615485, "learning_rate": 2.6873134756548902e-06, "loss": 0.63713038, "num_input_tokens_seen": 138673330, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.48046875, "step": 6460, "time_per_iteration": 3.7920982837677 }, { "auxiliary_loss_clip": 0.0107027, "auxiliary_loss_mlp": 0.01028064, "balance_loss_clip": 1.01505256, "balance_loss_mlp": 1.02309084, "epoch": 0.3884563354877499, "flos": 23621151381120.0, "grad_norm": 1.603888331922925, "language_loss": 0.86011147, "learning_rate": 2.6869586522078494e-06, "loss": 0.88109481, "num_input_tokens_seen": 138694185, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.47265625, "step": 6461, "time_per_iteration": 3.8378467559814453 }, { "auxiliary_loss_clip": 0.01071492, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.01687813, "balance_loss_mlp": 1.02263474, "epoch": 0.38851645874041785, "flos": 27452310514560.0, "grad_norm": 2.155288372346121, "language_loss": 0.70770979, "learning_rate": 2.686603804245022e-06, "loss": 0.72872674, "num_input_tokens_seen": 138714625, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.48828125, "step": 6462, "time_per_iteration": 2.4351794719696045 }, { "auxiliary_loss_clip": 0.01070858, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.01176691, "balance_loss_mlp": 1.02240705, "epoch": 0.3885765819930858, "flos": 25226911676160.0, "grad_norm": 2.0032015468710274, "language_loss": 0.7602641, "learning_rate": 2.6862489317790708e-06, "loss": 0.78123057, "num_input_tokens_seen": 138733585, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.48632812, "step": 6463, "time_per_iteration": 2.415416717529297 }, { "auxiliary_loss_clip": 0.01072443, "auxiliary_loss_mlp": 0.01037744, "balance_loss_clip": 1.02204442, "balance_loss_mlp": 1.02330184, "epoch": 0.3886367052457538, "flos": 16945653093120.0, "grad_norm": 2.460515244440767, "language_loss": 0.70125246, "learning_rate": 2.6858940348226606e-06, "loss": 0.72235441, "num_input_tokens_seen": 138752335, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.4921875, "step": 6464, "time_per_iteration": 2.359823703765869 }, { "auxiliary_loss_clip": 0.01070411, "auxiliary_loss_mlp": 0.01027254, "balance_loss_clip": 1.0133487, "balance_loss_mlp": 1.02377701, "epoch": 0.38869682849842174, "flos": 27153140140800.0, "grad_norm": 2.167047065494318, "language_loss": 0.69389498, "learning_rate": 2.685539113388456e-06, "loss": 0.71487164, "num_input_tokens_seen": 138768450, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.46679688, "step": 6465, "time_per_iteration": 3.7958550453186035 }, { "auxiliary_loss_clip": 0.01071652, "auxiliary_loss_mlp": 0.01037322, "balance_loss_clip": 1.02180052, "balance_loss_mlp": 1.02325916, "epoch": 0.3887569517510897, "flos": 21062716283520.0, "grad_norm": 1.9154723107647353, "language_loss": 0.77889287, "learning_rate": 2.6851841674891242e-06, "loss": 0.79998267, "num_input_tokens_seen": 138786775, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.484375, "step": 6466, "time_per_iteration": 2.3764679431915283 }, { "auxiliary_loss_clip": 0.01071944, "auxiliary_loss_mlp": 0.01038414, "balance_loss_clip": 1.0237987, "balance_loss_mlp": 1.02218235, "epoch": 0.38881707500375773, "flos": 29496091127040.0, "grad_norm": 1.5082812801305472, "language_loss": 0.69736731, "learning_rate": 2.6848291971373325e-06, "loss": 0.71847093, "num_input_tokens_seen": 138810100, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49804688, "step": 6467, "time_per_iteration": 2.4498167037963867 }, { "auxiliary_loss_clip": 0.01070674, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.02347827, "balance_loss_mlp": 1.02192819, "epoch": 0.3888771982564257, "flos": 17487470482560.0, "grad_norm": 2.673371480529278, "language_loss": 0.83308017, "learning_rate": 2.684474202345748e-06, "loss": 0.85418105, "num_input_tokens_seen": 138825140, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.48632812, "step": 6468, "time_per_iteration": 2.345813035964966 }, { "auxiliary_loss_clip": 0.01069591, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.01799738, "balance_loss_mlp": 1.02141571, "epoch": 0.38893732150909366, "flos": 21941410181760.0, "grad_norm": 1.9098837271843332, "language_loss": 0.84474897, "learning_rate": 2.6841191831270394e-06, "loss": 0.86576289, "num_input_tokens_seen": 138844115, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48242188, "step": 6469, "time_per_iteration": 2.384739875793457 }, { "auxiliary_loss_clip": 0.01068844, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.01747084, "balance_loss_mlp": 1.0214411, "epoch": 0.3889974447617616, "flos": 24275319125760.0, "grad_norm": 1.532286436761178, "language_loss": 0.74667895, "learning_rate": 2.683764139493878e-06, "loss": 0.76768535, "num_input_tokens_seen": 138860860, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.47460938, "step": 6470, "time_per_iteration": 2.403606414794922 }, { "auxiliary_loss_clip": 0.01072047, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.01594281, "balance_loss_mlp": 1.02286077, "epoch": 0.3890575680144296, "flos": 25665909511680.0, "grad_norm": 1.8054630740411863, "language_loss": 0.74865347, "learning_rate": 2.683409071458932e-06, "loss": 0.76967847, "num_input_tokens_seen": 138881910, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.4921875, "step": 6471, "time_per_iteration": 2.4449217319488525 }, { "auxiliary_loss_clip": 0.01070529, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.0143311, "balance_loss_mlp": 1.02330923, "epoch": 0.38911769126709755, "flos": 22854214344960.0, "grad_norm": 1.6527159984556832, "language_loss": 0.6779635, "learning_rate": 2.6830539790348755e-06, "loss": 0.69895518, "num_input_tokens_seen": 138900975, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.47265625, "step": 6472, "time_per_iteration": 2.3853163719177246 }, { "auxiliary_loss_clip": 0.01069038, "auxiliary_loss_mlp": 0.01034152, "balance_loss_clip": 1.02073467, "balance_loss_mlp": 1.02235472, "epoch": 0.3891778145197655, "flos": 25446340771200.0, "grad_norm": 1.6312456325993039, "language_loss": 0.76454026, "learning_rate": 2.6826988622343783e-06, "loss": 0.78557217, "num_input_tokens_seen": 138920795, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.46679688, "step": 6473, "time_per_iteration": 2.413755178451538 }, { "auxiliary_loss_clip": 0.01073302, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.01818395, "balance_loss_mlp": 1.02502763, "epoch": 0.3892379377724335, "flos": 14027088654720.0, "grad_norm": 2.1327613707513464, "language_loss": 0.70278001, "learning_rate": 2.6823437210701155e-06, "loss": 0.72384632, "num_input_tokens_seen": 138938770, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.48242188, "step": 6474, "time_per_iteration": 2.3581089973449707 }, { "auxiliary_loss_clip": 0.01071001, "auxiliary_loss_mlp": 0.01027767, "balance_loss_clip": 1.01386678, "balance_loss_mlp": 1.02233028, "epoch": 0.38929806102510145, "flos": 20156405633280.0, "grad_norm": 1.9791197149879987, "language_loss": 0.68668908, "learning_rate": 2.68198855555476e-06, "loss": 0.70767677, "num_input_tokens_seen": 138958880, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.48828125, "step": 6475, "time_per_iteration": 2.410979747772217 }, { "auxiliary_loss_clip": 0.01076725, "auxiliary_loss_mlp": 0.01034471, "balance_loss_clip": 1.0193913, "balance_loss_mlp": 1.02451897, "epoch": 0.3893581842777694, "flos": 22162864135680.0, "grad_norm": 1.846463898915937, "language_loss": 0.76028711, "learning_rate": 2.6816333657009876e-06, "loss": 0.78139907, "num_input_tokens_seen": 138977240, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5234375, "step": 6476, "time_per_iteration": 2.3859353065490723 }, { "auxiliary_loss_clip": 0.01012144, "auxiliary_loss_mlp": 0.01006416, "balance_loss_clip": 1.00499177, "balance_loss_mlp": 1.00233853, "epoch": 0.3894183075304374, "flos": 67298367548160.0, "grad_norm": 0.7936274695501733, "language_loss": 0.58226871, "learning_rate": 2.6812781515214742e-06, "loss": 0.6024543, "num_input_tokens_seen": 139039035, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.09765625, "step": 6477, "time_per_iteration": 3.0317206382751465 }, { "auxiliary_loss_clip": 0.01071261, "auxiliary_loss_mlp": 0.01029629, "balance_loss_clip": 1.01383948, "balance_loss_mlp": 1.02180839, "epoch": 0.38947843078310534, "flos": 18546630531840.0, "grad_norm": 2.5432454095304675, "language_loss": 0.78060055, "learning_rate": 2.680922913028895e-06, "loss": 0.8016094, "num_input_tokens_seen": 139055560, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.49414062, "step": 6478, "time_per_iteration": 2.367189884185791 }, { "auxiliary_loss_clip": 0.01068925, "auxiliary_loss_mlp": 0.01030842, "balance_loss_clip": 1.01638794, "balance_loss_mlp": 1.02023172, "epoch": 0.3895385540357733, "flos": 14605145902080.0, "grad_norm": 2.1643034659257996, "language_loss": 0.82361877, "learning_rate": 2.680567650235929e-06, "loss": 0.84461641, "num_input_tokens_seen": 139071865, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48632812, "step": 6479, "time_per_iteration": 2.361206531524658 }, { "auxiliary_loss_clip": 0.01068606, "auxiliary_loss_mlp": 0.01027348, "balance_loss_clip": 1.01347816, "balance_loss_mlp": 1.02187848, "epoch": 0.38959867728844133, "flos": 19974159002880.0, "grad_norm": 1.6593907654725017, "language_loss": 0.79844761, "learning_rate": 2.680212363155254e-06, "loss": 0.81940711, "num_input_tokens_seen": 139089640, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.46679688, "step": 6480, "time_per_iteration": 2.387129545211792 }, { "auxiliary_loss_clip": 0.01068032, "auxiliary_loss_mlp": 0.01025764, "balance_loss_clip": 1.01264453, "balance_loss_mlp": 1.02230835, "epoch": 0.3896588005411093, "flos": 22671094930560.0, "grad_norm": 1.6120260428091473, "language_loss": 0.83225536, "learning_rate": 2.6798570517995505e-06, "loss": 0.85319334, "num_input_tokens_seen": 139109365, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.45703125, "step": 6481, "time_per_iteration": 2.4379022121429443 }, { "auxiliary_loss_clip": 0.0107096, "auxiliary_loss_mlp": 0.01029307, "balance_loss_clip": 1.01559782, "balance_loss_mlp": 1.02476382, "epoch": 0.38971892379377726, "flos": 20994984512640.0, "grad_norm": 1.5549126313373527, "language_loss": 0.75272417, "learning_rate": 2.679501716181497e-06, "loss": 0.77372682, "num_input_tokens_seen": 139128260, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.4609375, "step": 6482, "time_per_iteration": 2.407773733139038 }, { "auxiliary_loss_clip": 0.01070952, "auxiliary_loss_mlp": 0.01027525, "balance_loss_clip": 1.01332748, "balance_loss_mlp": 1.02251124, "epoch": 0.3897790470464452, "flos": 22527392307840.0, "grad_norm": 2.435021378516795, "language_loss": 0.78798187, "learning_rate": 2.6791463563137752e-06, "loss": 0.8089667, "num_input_tokens_seen": 139147315, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.484375, "step": 6483, "time_per_iteration": 2.4283652305603027 }, { "auxiliary_loss_clip": 0.01069997, "auxiliary_loss_mlp": 0.01024407, "balance_loss_clip": 1.00904083, "balance_loss_mlp": 1.02176023, "epoch": 0.3898391702991132, "flos": 26208809153280.0, "grad_norm": 1.4183277940049122, "language_loss": 0.80131119, "learning_rate": 2.6787909722090667e-06, "loss": 0.82225525, "num_input_tokens_seen": 139167270, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.48242188, "step": 6484, "time_per_iteration": 2.443831205368042 }, { "auxiliary_loss_clip": 0.01071077, "auxiliary_loss_mlp": 0.01031662, "balance_loss_clip": 1.01605761, "balance_loss_mlp": 1.02364087, "epoch": 0.38989929355178116, "flos": 21064601496960.0, "grad_norm": 1.627917343405308, "language_loss": 0.78003567, "learning_rate": 2.6784355638800545e-06, "loss": 0.80106306, "num_input_tokens_seen": 139185970, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.47460938, "step": 6485, "time_per_iteration": 2.4115326404571533 }, { "auxiliary_loss_clip": 0.01071824, "auxiliary_loss_mlp": 0.01033282, "balance_loss_clip": 1.01825571, "balance_loss_mlp": 1.02301431, "epoch": 0.3899594168044491, "flos": 25482929742720.0, "grad_norm": 2.4352871616528615, "language_loss": 0.84839928, "learning_rate": 2.6780801313394225e-06, "loss": 0.86945033, "num_input_tokens_seen": 139203730, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.48828125, "step": 6486, "time_per_iteration": 2.4192097187042236 }, { "auxiliary_loss_clip": 0.01072469, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.01297235, "balance_loss_mlp": 1.02232361, "epoch": 0.3900195400571171, "flos": 31138021457280.0, "grad_norm": 1.8294198896871836, "language_loss": 0.85289669, "learning_rate": 2.677724674599854e-06, "loss": 0.87389517, "num_input_tokens_seen": 139222560, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5, "step": 6487, "time_per_iteration": 2.457425832748413 }, { "auxiliary_loss_clip": 0.01071129, "auxiliary_loss_mlp": 0.01026888, "balance_loss_clip": 1.01184368, "balance_loss_mlp": 1.02410769, "epoch": 0.39007966330978505, "flos": 20228885349120.0, "grad_norm": 1.4305199433096794, "language_loss": 0.72924948, "learning_rate": 2.6773691936740357e-06, "loss": 0.75022966, "num_input_tokens_seen": 139242165, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.46875, "step": 6488, "time_per_iteration": 2.3979246616363525 }, { "auxiliary_loss_clip": 0.01072582, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 1.01380038, "balance_loss_mlp": 1.02384758, "epoch": 0.390139786562453, "flos": 22527636687360.0, "grad_norm": 1.8267719893420327, "language_loss": 0.68645287, "learning_rate": 2.677013688574654e-06, "loss": 0.70746076, "num_input_tokens_seen": 139262525, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48828125, "step": 6489, "time_per_iteration": 2.415444850921631 }, { "auxiliary_loss_clip": 0.01068445, "auxiliary_loss_mlp": 0.0103154, "balance_loss_clip": 1.01842117, "balance_loss_mlp": 1.02287579, "epoch": 0.390199909815121, "flos": 26431694472960.0, "grad_norm": 2.277746125402378, "language_loss": 0.80612254, "learning_rate": 2.6766581593143937e-06, "loss": 0.82712239, "num_input_tokens_seen": 139282835, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.45703125, "step": 6490, "time_per_iteration": 2.4373369216918945 }, { "auxiliary_loss_clip": 0.01072244, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.01412678, "balance_loss_mlp": 1.0231123, "epoch": 0.39026003306778895, "flos": 17273627205120.0, "grad_norm": 2.084094265430606, "language_loss": 0.89501071, "learning_rate": 2.6763026059059455e-06, "loss": 0.91601658, "num_input_tokens_seen": 139299490, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.48828125, "step": 6491, "time_per_iteration": 2.3583133220672607 }, { "auxiliary_loss_clip": 0.01069856, "auxiliary_loss_mlp": 0.01028887, "balance_loss_clip": 1.01433182, "balance_loss_mlp": 1.02214003, "epoch": 0.3903201563204569, "flos": 24531756128640.0, "grad_norm": 1.6843933061451706, "language_loss": 0.78731203, "learning_rate": 2.675947028361996e-06, "loss": 0.80829942, "num_input_tokens_seen": 139317865, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.4765625, "step": 6492, "time_per_iteration": 2.4168667793273926 }, { "auxiliary_loss_clip": 0.0107176, "auxiliary_loss_mlp": 0.01029228, "balance_loss_clip": 1.01584113, "balance_loss_mlp": 1.02330852, "epoch": 0.39038027957312493, "flos": 23766843951360.0, "grad_norm": 1.692379312343689, "language_loss": 0.74323332, "learning_rate": 2.6755914266952365e-06, "loss": 0.76424325, "num_input_tokens_seen": 139339840, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.484375, "step": 6493, "time_per_iteration": 2.423197031021118 }, { "auxiliary_loss_clip": 0.01073855, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.01586103, "balance_loss_mlp": 1.02310073, "epoch": 0.3904404028257929, "flos": 14099742927360.0, "grad_norm": 1.78031682394404, "language_loss": 0.76241297, "learning_rate": 2.675235800918357e-06, "loss": 0.78346688, "num_input_tokens_seen": 139357555, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5078125, "step": 6494, "time_per_iteration": 2.3686842918395996 }, { "auxiliary_loss_clip": 0.01076157, "auxiliary_loss_mlp": 0.01039687, "balance_loss_clip": 1.02342701, "balance_loss_mlp": 1.02274215, "epoch": 0.39050052607846086, "flos": 16909099032960.0, "grad_norm": 7.283111660615988, "language_loss": 0.74459386, "learning_rate": 2.6748801510440484e-06, "loss": 0.76575232, "num_input_tokens_seen": 139374455, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.53515625, "step": 6495, "time_per_iteration": 3.778712511062622 }, { "auxiliary_loss_clip": 0.01071609, "auxiliary_loss_mlp": 0.0103057, "balance_loss_clip": 1.01537085, "balance_loss_mlp": 1.02238202, "epoch": 0.39056064933112883, "flos": 25914735838080.0, "grad_norm": 1.683167137524962, "language_loss": 0.67795867, "learning_rate": 2.674524477085003e-06, "loss": 0.69898045, "num_input_tokens_seen": 139394770, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.4921875, "step": 6496, "time_per_iteration": 2.4624650478363037 }, { "auxiliary_loss_clip": 0.01011279, "auxiliary_loss_mlp": 0.0100104, "balance_loss_clip": 0.99935949, "balance_loss_mlp": 1.00167203, "epoch": 0.3906207725837968, "flos": 60025471119360.0, "grad_norm": 0.7020153734480111, "language_loss": 0.53949678, "learning_rate": 2.674168779053914e-06, "loss": 0.55962002, "num_input_tokens_seen": 139454760, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.09619141, "step": 6497, "time_per_iteration": 3.1207151412963867 }, { "auxiliary_loss_clip": 0.01072483, "auxiliary_loss_mlp": 0.01030652, "balance_loss_clip": 1.01607883, "balance_loss_mlp": 1.02425528, "epoch": 0.39068089583646476, "flos": 21067638785280.0, "grad_norm": 1.9295321526756164, "language_loss": 0.68641782, "learning_rate": 2.6738130569634763e-06, "loss": 0.7074492, "num_input_tokens_seen": 139472645, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48242188, "step": 6498, "time_per_iteration": 2.3912081718444824 }, { "auxiliary_loss_clip": 0.010112, "auxiliary_loss_mlp": 0.01000682, "balance_loss_clip": 0.99929941, "balance_loss_mlp": 1.00132179, "epoch": 0.3907410190891327, "flos": 70441911987840.0, "grad_norm": 0.7281905591079403, "language_loss": 0.51770073, "learning_rate": 2.673457310826383e-06, "loss": 0.53781956, "num_input_tokens_seen": 139536730, "router_z_loss_clip": 0.01385498, "router_z_loss_mlp": 0.09863281, "step": 6499, "time_per_iteration": 4.466989040374756 }, { "auxiliary_loss_clip": 0.010724, "auxiliary_loss_mlp": 0.01037923, "balance_loss_clip": 1.02064979, "balance_loss_mlp": 1.02197552, "epoch": 0.3908011423418007, "flos": 27961274448000.0, "grad_norm": 1.6407594440968745, "language_loss": 0.73890769, "learning_rate": 2.673101540655331e-06, "loss": 0.76001096, "num_input_tokens_seen": 139557540, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.50390625, "step": 6500, "time_per_iteration": 2.4607081413269043 }, { "auxiliary_loss_clip": 0.01071294, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.01432037, "balance_loss_mlp": 1.02248883, "epoch": 0.39086126559446865, "flos": 24460952158080.0, "grad_norm": 2.075601751373423, "language_loss": 0.69014835, "learning_rate": 2.6727457464630166e-06, "loss": 0.71115398, "num_input_tokens_seen": 139576875, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48828125, "step": 6501, "time_per_iteration": 2.419100522994995 }, { "auxiliary_loss_clip": 0.01071081, "auxiliary_loss_mlp": 0.01033222, "balance_loss_clip": 1.01877975, "balance_loss_mlp": 1.02292275, "epoch": 0.3909213888471366, "flos": 16940730591360.0, "grad_norm": 1.660495213771565, "language_loss": 0.78816283, "learning_rate": 2.6723899282621363e-06, "loss": 0.80920589, "num_input_tokens_seen": 139594295, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48242188, "step": 6502, "time_per_iteration": 3.780056953430176 }, { "auxiliary_loss_clip": 0.01070883, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.01530504, "balance_loss_mlp": 1.02517021, "epoch": 0.3909815120998046, "flos": 29277115879680.0, "grad_norm": 2.1044036887519217, "language_loss": 0.80444592, "learning_rate": 2.6720340860653894e-06, "loss": 0.82543778, "num_input_tokens_seen": 139614080, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.45703125, "step": 6503, "time_per_iteration": 2.4481537342071533 }, { "auxiliary_loss_clip": 0.01067127, "auxiliary_loss_mlp": 0.01027997, "balance_loss_clip": 1.01439512, "balance_loss_mlp": 1.02128088, "epoch": 0.39104163535247255, "flos": 18950296204800.0, "grad_norm": 5.435741648370807, "language_loss": 0.71616352, "learning_rate": 2.671678219885475e-06, "loss": 0.73711479, "num_input_tokens_seen": 139632755, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.45898438, "step": 6504, "time_per_iteration": 2.3827760219573975 }, { "auxiliary_loss_clip": 0.01070296, "auxiliary_loss_mlp": 0.01032959, "balance_loss_clip": 1.01871371, "balance_loss_mlp": 1.02241278, "epoch": 0.3911017586051405, "flos": 26322137026560.0, "grad_norm": 1.510870588813075, "language_loss": 0.83190632, "learning_rate": 2.6713223297350926e-06, "loss": 0.85293889, "num_input_tokens_seen": 139654205, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.48046875, "step": 6505, "time_per_iteration": 3.819749355316162 }, { "auxiliary_loss_clip": 0.01070381, "auxiliary_loss_mlp": 0.0103343, "balance_loss_clip": 1.01857638, "balance_loss_mlp": 1.02335227, "epoch": 0.3911618818578085, "flos": 21834680555520.0, "grad_norm": 3.53475962100524, "language_loss": 0.71180999, "learning_rate": 2.6709664156269426e-06, "loss": 0.73284805, "num_input_tokens_seen": 139673595, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.47070312, "step": 6506, "time_per_iteration": 2.4021599292755127 }, { "auxiliary_loss_clip": 0.01068826, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.01986074, "balance_loss_mlp": 1.02240252, "epoch": 0.3912220051104765, "flos": 16358833094400.0, "grad_norm": 2.1625273886153793, "language_loss": 0.75075412, "learning_rate": 2.670610477573727e-06, "loss": 0.77177703, "num_input_tokens_seen": 139690565, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.46484375, "step": 6507, "time_per_iteration": 2.361372947692871 }, { "auxiliary_loss_clip": 0.0101144, "auxiliary_loss_mlp": 0.01009372, "balance_loss_clip": 1.00803065, "balance_loss_mlp": 1.00197387, "epoch": 0.39128212836314447, "flos": 71047620898560.0, "grad_norm": 0.759223497609016, "language_loss": 0.56506526, "learning_rate": 2.670254515588149e-06, "loss": 0.58527339, "num_input_tokens_seen": 139749420, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.09472656, "step": 6508, "time_per_iteration": 3.1868019104003906 }, { "auxiliary_loss_clip": 0.01070415, "auxiliary_loss_mlp": 0.01033676, "balance_loss_clip": 1.0200026, "balance_loss_mlp": 1.02263308, "epoch": 0.39134225161581243, "flos": 20331146321280.0, "grad_norm": 1.79345970067716, "language_loss": 0.75974876, "learning_rate": 2.6698985296829115e-06, "loss": 0.78078967, "num_input_tokens_seen": 139766265, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.4765625, "step": 6509, "time_per_iteration": 2.3881888389587402 }, { "auxiliary_loss_clip": 0.01070691, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.01974368, "balance_loss_mlp": 1.02108085, "epoch": 0.3914023748684804, "flos": 17017469493120.0, "grad_norm": 2.7341706514927857, "language_loss": 0.82551944, "learning_rate": 2.6695425198707187e-06, "loss": 0.84657848, "num_input_tokens_seen": 139782400, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.49804688, "step": 6510, "time_per_iteration": 2.360872268676758 }, { "auxiliary_loss_clip": 0.01070789, "auxiliary_loss_mlp": 0.01026773, "balance_loss_clip": 1.01234913, "balance_loss_mlp": 1.02283633, "epoch": 0.39146249812114836, "flos": 18404254540800.0, "grad_norm": 2.061524877530665, "language_loss": 0.76299548, "learning_rate": 2.669186486164276e-06, "loss": 0.78397119, "num_input_tokens_seen": 139801435, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48046875, "step": 6511, "time_per_iteration": 2.374729871749878 }, { "auxiliary_loss_clip": 0.01010949, "auxiliary_loss_mlp": 0.01003806, "balance_loss_clip": 1.00247073, "balance_loss_mlp": 1.00114608, "epoch": 0.3915226213738163, "flos": 67633638134400.0, "grad_norm": 0.7227842401470164, "language_loss": 0.57769883, "learning_rate": 2.6688304285762878e-06, "loss": 0.59784639, "num_input_tokens_seen": 139869700, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.09814453, "step": 6512, "time_per_iteration": 3.1133248805999756 }, { "auxiliary_loss_clip": 0.01072146, "auxiliary_loss_mlp": 0.01030807, "balance_loss_clip": 1.01521444, "balance_loss_mlp": 1.02253473, "epoch": 0.3915827446264843, "flos": 26358132504960.0, "grad_norm": 1.6924279689360266, "language_loss": 0.69814038, "learning_rate": 2.6684743471194627e-06, "loss": 0.71916991, "num_input_tokens_seen": 139890140, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.49609375, "step": 6513, "time_per_iteration": 2.425922393798828 }, { "auxiliary_loss_clip": 0.01073337, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.01479053, "balance_loss_mlp": 1.02243829, "epoch": 0.39164286787915226, "flos": 21942841547520.0, "grad_norm": 4.158661308402621, "language_loss": 0.75745928, "learning_rate": 2.668118241806508e-06, "loss": 0.77849996, "num_input_tokens_seen": 139908020, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.5078125, "step": 6514, "time_per_iteration": 2.3662989139556885 }, { "auxiliary_loss_clip": 0.0107254, "auxiliary_loss_mlp": 0.0102883, "balance_loss_clip": 1.01482892, "balance_loss_mlp": 1.02344418, "epoch": 0.3917029911318202, "flos": 16398878290560.0, "grad_norm": 1.9568285385612725, "language_loss": 0.77076769, "learning_rate": 2.6677621126501316e-06, "loss": 0.79178137, "num_input_tokens_seen": 139926180, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.4921875, "step": 6515, "time_per_iteration": 2.4456069469451904 }, { "auxiliary_loss_clip": 0.01068817, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 1.0126667, "balance_loss_mlp": 1.02213216, "epoch": 0.3917631143844882, "flos": 26210554721280.0, "grad_norm": 1.3565878970265306, "language_loss": 0.80078703, "learning_rate": 2.667405959663043e-06, "loss": 0.82173216, "num_input_tokens_seen": 139947420, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.46679688, "step": 6516, "time_per_iteration": 2.4280171394348145 }, { "auxiliary_loss_clip": 0.01072937, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 1.01500273, "balance_loss_mlp": 1.02349448, "epoch": 0.39182323763715615, "flos": 18547468404480.0, "grad_norm": 2.2096037200594503, "language_loss": 0.70288467, "learning_rate": 2.667049782857952e-06, "loss": 0.72390866, "num_input_tokens_seen": 139965800, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49414062, "step": 6517, "time_per_iteration": 2.3704264163970947 }, { "auxiliary_loss_clip": 0.01070886, "auxiliary_loss_mlp": 0.01032609, "balance_loss_clip": 1.01746893, "balance_loss_mlp": 1.02152848, "epoch": 0.3918833608898241, "flos": 34312115203200.0, "grad_norm": 1.809158599947749, "language_loss": 0.7184099, "learning_rate": 2.666693582247571e-06, "loss": 0.73944485, "num_input_tokens_seen": 139988140, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49414062, "step": 6518, "time_per_iteration": 2.4997715950012207 }, { "auxiliary_loss_clip": 0.010734, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.01490474, "balance_loss_mlp": 1.02406752, "epoch": 0.3919434841424921, "flos": 36938107514880.0, "grad_norm": 1.5865486953082641, "language_loss": 0.61552501, "learning_rate": 2.66633735784461e-06, "loss": 0.63655806, "num_input_tokens_seen": 140010060, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.4921875, "step": 6519, "time_per_iteration": 2.522475004196167 }, { "auxiliary_loss_clip": 0.0107778, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.01977742, "balance_loss_mlp": 1.02490759, "epoch": 0.3920036073951601, "flos": 23507963153280.0, "grad_norm": 2.165308211753418, "language_loss": 0.67055762, "learning_rate": 2.665981109661784e-06, "loss": 0.69169247, "num_input_tokens_seen": 140029400, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.52734375, "step": 6520, "time_per_iteration": 2.4056036472320557 }, { "auxiliary_loss_clip": 0.01070733, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.01427794, "balance_loss_mlp": 1.02329373, "epoch": 0.39206373064782807, "flos": 18405092413440.0, "grad_norm": 1.7036751181350607, "language_loss": 0.78544468, "learning_rate": 2.6656248377118043e-06, "loss": 0.80643404, "num_input_tokens_seen": 140048940, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.47265625, "step": 6521, "time_per_iteration": 2.388385534286499 }, { "auxiliary_loss_clip": 0.0107627, "auxiliary_loss_mlp": 0.01031928, "balance_loss_clip": 1.01412439, "balance_loss_mlp": 1.02421784, "epoch": 0.39212385390049603, "flos": 12312224760960.0, "grad_norm": 2.1774890338730417, "language_loss": 0.69795561, "learning_rate": 2.6652685420073867e-06, "loss": 0.71903753, "num_input_tokens_seen": 140066380, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.51953125, "step": 6522, "time_per_iteration": 2.3660755157470703 }, { "auxiliary_loss_clip": 0.01073926, "auxiliary_loss_mlp": 0.01030369, "balance_loss_clip": 1.01585519, "balance_loss_mlp": 1.02330637, "epoch": 0.392183977153164, "flos": 19718140936320.0, "grad_norm": 1.8100435821216008, "language_loss": 0.7645672, "learning_rate": 2.664912222561246e-06, "loss": 0.78561014, "num_input_tokens_seen": 140085275, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.5078125, "step": 6523, "time_per_iteration": 2.380765676498413 }, { "auxiliary_loss_clip": 0.01073468, "auxiliary_loss_mlp": 0.01028199, "balance_loss_clip": 1.01300025, "balance_loss_mlp": 1.02377224, "epoch": 0.39224410040583196, "flos": 33143537352960.0, "grad_norm": 2.19565243827638, "language_loss": 0.62009531, "learning_rate": 2.664555879386098e-06, "loss": 0.64111197, "num_input_tokens_seen": 140105105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.49609375, "step": 6524, "time_per_iteration": 2.48087739944458 }, { "auxiliary_loss_clip": 0.0107139, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.0127461, "balance_loss_mlp": 1.02229893, "epoch": 0.39230422365849993, "flos": 27781192321920.0, "grad_norm": 1.7799679359940686, "language_loss": 0.74032056, "learning_rate": 2.6641995124946606e-06, "loss": 0.7613166, "num_input_tokens_seen": 140125645, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.49023438, "step": 6525, "time_per_iteration": 2.4335601329803467 }, { "auxiliary_loss_clip": 0.01072296, "auxiliary_loss_mlp": 0.01030525, "balance_loss_clip": 1.01657772, "balance_loss_mlp": 1.02280092, "epoch": 0.3923643469111679, "flos": 17930657681280.0, "grad_norm": 1.922923076222103, "language_loss": 0.81455332, "learning_rate": 2.6638431218996517e-06, "loss": 0.83558154, "num_input_tokens_seen": 140141925, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.49609375, "step": 6526, "time_per_iteration": 2.3506667613983154 }, { "auxiliary_loss_clip": 0.01072488, "auxiliary_loss_mlp": 0.01027286, "balance_loss_clip": 1.01310003, "balance_loss_mlp": 1.02294397, "epoch": 0.39242447016383586, "flos": 24058438560000.0, "grad_norm": 1.7242206333312153, "language_loss": 0.69962192, "learning_rate": 2.6634867076137886e-06, "loss": 0.72061968, "num_input_tokens_seen": 140160965, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.49609375, "step": 6527, "time_per_iteration": 2.414858818054199 }, { "auxiliary_loss_clip": 0.01069636, "auxiliary_loss_mlp": 0.01031325, "balance_loss_clip": 1.01660883, "balance_loss_mlp": 1.02246583, "epoch": 0.3924845934165038, "flos": 10663486715520.0, "grad_norm": 3.0285435867831803, "language_loss": 0.82174188, "learning_rate": 2.663130269649792e-06, "loss": 0.8427515, "num_input_tokens_seen": 140177780, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.47265625, "step": 6528, "time_per_iteration": 2.350456714630127 }, { "auxiliary_loss_clip": 0.01071739, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.01494205, "balance_loss_mlp": 1.02350736, "epoch": 0.3925447166691718, "flos": 31244646349440.0, "grad_norm": 1.5841101196191865, "language_loss": 0.68338889, "learning_rate": 2.6627738080203817e-06, "loss": 0.70440561, "num_input_tokens_seen": 140201660, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.48242188, "step": 6529, "time_per_iteration": 2.492323875427246 }, { "auxiliary_loss_clip": 0.01074034, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.01621377, "balance_loss_mlp": 1.02381659, "epoch": 0.39260483992183975, "flos": 29414010787200.0, "grad_norm": 2.4145039296146105, "language_loss": 0.8057965, "learning_rate": 2.662417322738279e-06, "loss": 0.82685471, "num_input_tokens_seen": 140218585, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.50390625, "step": 6530, "time_per_iteration": 2.416611433029175 }, { "auxiliary_loss_clip": 0.01070743, "auxiliary_loss_mlp": 0.01029347, "balance_loss_clip": 1.01610875, "balance_loss_mlp": 1.02235818, "epoch": 0.3926649631745077, "flos": 22856658140160.0, "grad_norm": 1.4002561573666452, "language_loss": 0.75478733, "learning_rate": 2.6620608138162055e-06, "loss": 0.77578831, "num_input_tokens_seen": 140239905, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.484375, "step": 6531, "time_per_iteration": 2.4023075103759766 }, { "auxiliary_loss_clip": 0.01011892, "auxiliary_loss_mlp": 0.01001146, "balance_loss_clip": 0.99988264, "balance_loss_mlp": 1.00243986, "epoch": 0.3927250864271757, "flos": 63890880163200.0, "grad_norm": 0.8061435048317566, "language_loss": 0.60290277, "learning_rate": 2.6617042812668857e-06, "loss": 0.62303311, "num_input_tokens_seen": 140293820, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.09472656, "step": 6532, "time_per_iteration": 2.8868446350097656 }, { "auxiliary_loss_clip": 0.01011789, "auxiliary_loss_mlp": 0.01000566, "balance_loss_clip": 0.99911773, "balance_loss_mlp": 1.00211549, "epoch": 0.3927852096798437, "flos": 68906117790720.0, "grad_norm": 0.7726829765055597, "language_loss": 0.554878, "learning_rate": 2.661347725103041e-06, "loss": 0.57500154, "num_input_tokens_seen": 140360420, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.09667969, "step": 6533, "time_per_iteration": 3.139435291290283 }, { "auxiliary_loss_clip": 0.0107725, "auxiliary_loss_mlp": 0.0103213, "balance_loss_clip": 1.01665688, "balance_loss_mlp": 1.02562857, "epoch": 0.39284533293251167, "flos": 29714682349440.0, "grad_norm": 1.8939754286522168, "language_loss": 0.76383758, "learning_rate": 2.6609911453373978e-06, "loss": 0.78493142, "num_input_tokens_seen": 140381950, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.515625, "step": 6534, "time_per_iteration": 3.886058807373047 }, { "auxiliary_loss_clip": 0.01074632, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.01887536, "balance_loss_mlp": 1.02283192, "epoch": 0.39290545618517964, "flos": 18551029363200.0, "grad_norm": 2.4519466383752313, "language_loss": 0.78075075, "learning_rate": 2.660634541982681e-06, "loss": 0.80183721, "num_input_tokens_seen": 140399410, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.51953125, "step": 6535, "time_per_iteration": 2.3610856533050537 }, { "auxiliary_loss_clip": 0.0107189, "auxiliary_loss_mlp": 0.01026046, "balance_loss_clip": 1.01236725, "balance_loss_mlp": 1.02398586, "epoch": 0.3929655794378476, "flos": 26248295767680.0, "grad_norm": 1.9195857503071385, "language_loss": 0.69086009, "learning_rate": 2.6602779150516163e-06, "loss": 0.7118395, "num_input_tokens_seen": 140419055, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.48046875, "step": 6536, "time_per_iteration": 2.4630112648010254 }, { "auxiliary_loss_clip": 0.01068108, "auxiliary_loss_mlp": 0.0102794, "balance_loss_clip": 1.01433218, "balance_loss_mlp": 1.02199149, "epoch": 0.39302570269051557, "flos": 29276662032000.0, "grad_norm": 1.7973021879875843, "language_loss": 0.69352496, "learning_rate": 2.6599212645569316e-06, "loss": 0.71448541, "num_input_tokens_seen": 140438800, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.4609375, "step": 6537, "time_per_iteration": 2.4424188137054443 }, { "auxiliary_loss_clip": 0.01074702, "auxiliary_loss_mlp": 0.01030436, "balance_loss_clip": 1.01601124, "balance_loss_mlp": 1.02404118, "epoch": 0.39308582594318353, "flos": 17346490945920.0, "grad_norm": 1.6493351350649215, "language_loss": 0.78746736, "learning_rate": 2.6595645905113546e-06, "loss": 0.80851877, "num_input_tokens_seen": 140456880, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.5078125, "step": 6538, "time_per_iteration": 2.366671562194824 }, { "auxiliary_loss_clip": 0.01011739, "auxiliary_loss_mlp": 0.01002507, "balance_loss_clip": 1.00125504, "balance_loss_mlp": 1.00184202, "epoch": 0.3931459491958515, "flos": 61004296396800.0, "grad_norm": 0.8007630957422047, "language_loss": 0.61874413, "learning_rate": 2.659207892927614e-06, "loss": 0.63888663, "num_input_tokens_seen": 140507510, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.09912109, "step": 6539, "time_per_iteration": 4.198698282241821 }, { "auxiliary_loss_clip": 0.01073834, "auxiliary_loss_mlp": 0.01029192, "balance_loss_clip": 1.01349247, "balance_loss_mlp": 1.0240736, "epoch": 0.39320607244851946, "flos": 39014567026560.0, "grad_norm": 1.9675065114053225, "language_loss": 0.68048406, "learning_rate": 2.658851171818439e-06, "loss": 0.70151436, "num_input_tokens_seen": 140528740, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.49804688, "step": 6540, "time_per_iteration": 2.5468459129333496 }, { "auxiliary_loss_clip": 0.0107159, "auxiliary_loss_mlp": 0.01028758, "balance_loss_clip": 1.01441133, "balance_loss_mlp": 1.02308488, "epoch": 0.3932661957011874, "flos": 24678635685120.0, "grad_norm": 1.7055337015864176, "language_loss": 0.72762537, "learning_rate": 2.65849442719656e-06, "loss": 0.7486288, "num_input_tokens_seen": 140547560, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.484375, "step": 6541, "time_per_iteration": 3.7921223640441895 }, { "auxiliary_loss_clip": 0.01011855, "auxiliary_loss_mlp": 0.0100478, "balance_loss_clip": 1.00346291, "balance_loss_mlp": 1.00177538, "epoch": 0.3933263189538554, "flos": 70093375084800.0, "grad_norm": 0.8597354786466657, "language_loss": 0.60322255, "learning_rate": 2.65813765907471e-06, "loss": 0.62338883, "num_input_tokens_seen": 140601175, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.10058594, "step": 6542, "time_per_iteration": 2.870962619781494 }, { "auxiliary_loss_clip": 0.01072311, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.01869273, "balance_loss_mlp": 1.02378333, "epoch": 0.39338644220652336, "flos": 22927985781120.0, "grad_norm": 1.4538670705019117, "language_loss": 0.82157886, "learning_rate": 2.657780867465619e-06, "loss": 0.84263325, "num_input_tokens_seen": 140622200, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.484375, "step": 6543, "time_per_iteration": 2.408843994140625 }, { "auxiliary_loss_clip": 0.0107018, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.01434934, "balance_loss_mlp": 1.02141893, "epoch": 0.3934465654591913, "flos": 30846810873600.0, "grad_norm": 1.4830334299248502, "language_loss": 0.68879461, "learning_rate": 2.6574240523820214e-06, "loss": 0.70978415, "num_input_tokens_seen": 140643125, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48828125, "step": 6544, "time_per_iteration": 3.8891849517822266 }, { "auxiliary_loss_clip": 0.01074671, "auxiliary_loss_mlp": 0.0103487, "balance_loss_clip": 1.01855075, "balance_loss_mlp": 1.02308834, "epoch": 0.3935066887118593, "flos": 29235394938240.0, "grad_norm": 2.233016669922964, "language_loss": 0.75380683, "learning_rate": 2.6570672138366503e-06, "loss": 0.77490222, "num_input_tokens_seen": 140662500, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.515625, "step": 6545, "time_per_iteration": 2.4544827938079834 }, { "auxiliary_loss_clip": 0.01070354, "auxiliary_loss_mlp": 0.01032217, "balance_loss_clip": 1.01915145, "balance_loss_mlp": 1.02438796, "epoch": 0.3935668119645273, "flos": 19134288403200.0, "grad_norm": 1.375636981667089, "language_loss": 0.74318087, "learning_rate": 2.65671035184224e-06, "loss": 0.76420653, "num_input_tokens_seen": 140681960, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.4609375, "step": 6546, "time_per_iteration": 2.3854589462280273 }, { "auxiliary_loss_clip": 0.01074191, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.01900089, "balance_loss_mlp": 1.02254164, "epoch": 0.3936269352171953, "flos": 18515103707520.0, "grad_norm": 2.160252578989639, "language_loss": 0.81589425, "learning_rate": 2.656353466411527e-06, "loss": 0.83697367, "num_input_tokens_seen": 140699170, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.515625, "step": 6547, "time_per_iteration": 2.370854139328003 }, { "auxiliary_loss_clip": 0.01071497, "auxiliary_loss_mlp": 0.01028708, "balance_loss_clip": 1.01473093, "balance_loss_mlp": 1.02171779, "epoch": 0.39368705846986324, "flos": 15631906343040.0, "grad_norm": 2.0570765177892136, "language_loss": 0.84051442, "learning_rate": 2.6559965575572475e-06, "loss": 0.86151642, "num_input_tokens_seen": 140714920, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.49609375, "step": 6548, "time_per_iteration": 2.3534698486328125 }, { "auxiliary_loss_clip": 0.01068807, "auxiliary_loss_mlp": 0.01024729, "balance_loss_clip": 1.01131225, "balance_loss_mlp": 1.02213502, "epoch": 0.3937471817225312, "flos": 21324739104000.0, "grad_norm": 1.456044065726379, "language_loss": 0.72929382, "learning_rate": 2.6556396252921375e-06, "loss": 0.75022912, "num_input_tokens_seen": 140734595, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.46679688, "step": 6549, "time_per_iteration": 2.414046049118042 }, { "auxiliary_loss_clip": 0.01074092, "auxiliary_loss_mlp": 0.01030853, "balance_loss_clip": 1.01500392, "balance_loss_mlp": 1.0246824, "epoch": 0.39380730497519917, "flos": 20775660151680.0, "grad_norm": 1.8810851407891063, "language_loss": 0.7762537, "learning_rate": 2.6552826696289363e-06, "loss": 0.79730314, "num_input_tokens_seen": 140754050, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.4921875, "step": 6550, "time_per_iteration": 2.393656015396118 }, { "auxiliary_loss_clip": 0.01070584, "auxiliary_loss_mlp": 0.01026359, "balance_loss_clip": 1.01179183, "balance_loss_mlp": 1.02308702, "epoch": 0.39386742822786713, "flos": 21608897592960.0, "grad_norm": 1.8924948801528723, "language_loss": 0.81007159, "learning_rate": 2.6549256905803815e-06, "loss": 0.83104104, "num_input_tokens_seen": 140771440, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.47460938, "step": 6551, "time_per_iteration": 2.3955979347229004 }, { "auxiliary_loss_clip": 0.0107367, "auxiliary_loss_mlp": 0.0103241, "balance_loss_clip": 1.01774752, "balance_loss_mlp": 1.02270997, "epoch": 0.3939275514805351, "flos": 12414031885440.0, "grad_norm": 2.706361055961179, "language_loss": 0.79950058, "learning_rate": 2.654568688159214e-06, "loss": 0.82056135, "num_input_tokens_seen": 140786715, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.5078125, "step": 6552, "time_per_iteration": 2.3435208797454834 }, { "auxiliary_loss_clip": 0.01072476, "auxiliary_loss_mlp": 0.01033191, "balance_loss_clip": 1.01882601, "balance_loss_mlp": 1.02324486, "epoch": 0.39398767473320306, "flos": 18551029363200.0, "grad_norm": 2.3230211960388516, "language_loss": 0.71203274, "learning_rate": 2.6542116623781736e-06, "loss": 0.73308933, "num_input_tokens_seen": 140804950, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.4921875, "step": 6553, "time_per_iteration": 2.4161274433135986 }, { "auxiliary_loss_clip": 0.01072341, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.01913667, "balance_loss_mlp": 1.02345705, "epoch": 0.39404779798587103, "flos": 29307769920000.0, "grad_norm": 2.607946335846458, "language_loss": 0.6435535, "learning_rate": 2.6538546132500023e-06, "loss": 0.66461289, "num_input_tokens_seen": 140822800, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48828125, "step": 6554, "time_per_iteration": 2.4407150745391846 }, { "auxiliary_loss_clip": 0.01073041, "auxiliary_loss_mlp": 0.01037871, "balance_loss_clip": 1.02342331, "balance_loss_mlp": 1.02415478, "epoch": 0.394107921238539, "flos": 34895618622720.0, "grad_norm": 1.8755209423884256, "language_loss": 0.79221523, "learning_rate": 2.6534975407874417e-06, "loss": 0.81332433, "num_input_tokens_seen": 140842940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.49023438, "step": 6555, "time_per_iteration": 2.5411691665649414 }, { "auxiliary_loss_clip": 0.01075385, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.01912856, "balance_loss_mlp": 1.02499366, "epoch": 0.39416804449120696, "flos": 25080276499200.0, "grad_norm": 1.7085186996768669, "language_loss": 0.71265376, "learning_rate": 2.653140445003234e-06, "loss": 0.73375487, "num_input_tokens_seen": 140863060, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.50390625, "step": 6556, "time_per_iteration": 2.4155921936035156 }, { "auxiliary_loss_clip": 0.01072876, "auxiliary_loss_mlp": 0.01023499, "balance_loss_clip": 1.00914669, "balance_loss_mlp": 1.02310085, "epoch": 0.3942281677438749, "flos": 32305272675840.0, "grad_norm": 1.7635482576620043, "language_loss": 0.83707261, "learning_rate": 2.652783325910125e-06, "loss": 0.85803634, "num_input_tokens_seen": 140883795, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49804688, "step": 6557, "time_per_iteration": 2.5170037746429443 }, { "auxiliary_loss_clip": 0.01073957, "auxiliary_loss_mlp": 0.01026951, "balance_loss_clip": 1.01318264, "balance_loss_mlp": 1.02437162, "epoch": 0.3942882909965429, "flos": 24935456712960.0, "grad_norm": 4.2244640702949, "language_loss": 0.80184871, "learning_rate": 2.652426183520857e-06, "loss": 0.82285774, "num_input_tokens_seen": 140903055, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.49609375, "step": 6558, "time_per_iteration": 2.408504009246826 }, { "auxiliary_loss_clip": 0.01070894, "auxiliary_loss_mlp": 0.01028135, "balance_loss_clip": 1.01507616, "balance_loss_mlp": 1.02291012, "epoch": 0.39434841424921085, "flos": 11873994975360.0, "grad_norm": 1.7397184575560158, "language_loss": 0.70887214, "learning_rate": 2.652069017848178e-06, "loss": 0.72986245, "num_input_tokens_seen": 140920685, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.48046875, "step": 6559, "time_per_iteration": 2.380168914794922 }, { "auxiliary_loss_clip": 0.01074489, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.01749015, "balance_loss_mlp": 1.02278757, "epoch": 0.3944085375018789, "flos": 16360718307840.0, "grad_norm": 1.9244022930595914, "language_loss": 0.80489105, "learning_rate": 2.651711828904833e-06, "loss": 0.82596755, "num_input_tokens_seen": 140937320, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.515625, "step": 6560, "time_per_iteration": 2.408808708190918 }, { "auxiliary_loss_clip": 0.01074947, "auxiliary_loss_mlp": 0.01030504, "balance_loss_clip": 1.01499486, "balance_loss_mlp": 1.02466321, "epoch": 0.39446866075454684, "flos": 10632623207040.0, "grad_norm": 2.068328291681275, "language_loss": 0.83086205, "learning_rate": 2.6513546167035687e-06, "loss": 0.85191661, "num_input_tokens_seen": 140954855, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.50390625, "step": 6561, "time_per_iteration": 2.393876314163208 }, { "auxiliary_loss_clip": 0.01073276, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.01291418, "balance_loss_mlp": 1.02332056, "epoch": 0.3945287840072148, "flos": 18186501191040.0, "grad_norm": 2.4164623338452085, "language_loss": 0.79811245, "learning_rate": 2.6509973812571336e-06, "loss": 0.81912267, "num_input_tokens_seen": 140973250, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.5, "step": 6562, "time_per_iteration": 2.3570621013641357 }, { "auxiliary_loss_clip": 0.01069398, "auxiliary_loss_mlp": 0.01033221, "balance_loss_clip": 1.01935077, "balance_loss_mlp": 1.02289522, "epoch": 0.39458890725988277, "flos": 23038765125120.0, "grad_norm": 1.5110629373540265, "language_loss": 0.81414276, "learning_rate": 2.6506401225782763e-06, "loss": 0.83516896, "num_input_tokens_seen": 140993050, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.46484375, "step": 6563, "time_per_iteration": 2.4207420349121094 }, { "auxiliary_loss_clip": 0.0107185, "auxiliary_loss_mlp": 0.01034271, "balance_loss_clip": 1.01910758, "balance_loss_mlp": 1.02302861, "epoch": 0.39464903051255074, "flos": 17158274472960.0, "grad_norm": 3.81212947711729, "language_loss": 0.70077831, "learning_rate": 2.650282840679747e-06, "loss": 0.72183955, "num_input_tokens_seen": 141010815, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.48828125, "step": 6564, "time_per_iteration": 2.3709397315979004 }, { "auxiliary_loss_clip": 0.01074149, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.02231073, "balance_loss_mlp": 1.02338731, "epoch": 0.3947091537652187, "flos": 15888064055040.0, "grad_norm": 2.6621807915574287, "language_loss": 0.83101928, "learning_rate": 2.6499255355742966e-06, "loss": 0.85214114, "num_input_tokens_seen": 141028720, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.5078125, "step": 6565, "time_per_iteration": 2.383371114730835 }, { "auxiliary_loss_clip": 0.01072819, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.01696622, "balance_loss_mlp": 1.02392125, "epoch": 0.39476927701788667, "flos": 18544675495680.0, "grad_norm": 1.7362731446266715, "language_loss": 0.83571386, "learning_rate": 2.649568207274674e-06, "loss": 0.85675311, "num_input_tokens_seen": 141046025, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48828125, "step": 6566, "time_per_iteration": 2.390942335128784 }, { "auxiliary_loss_clip": 0.01076011, "auxiliary_loss_mlp": 0.01033674, "balance_loss_clip": 1.01796269, "balance_loss_mlp": 1.02457607, "epoch": 0.39482940027055463, "flos": 22274551175040.0, "grad_norm": 1.628242175835704, "language_loss": 0.77361, "learning_rate": 2.649210855793634e-06, "loss": 0.79470682, "num_input_tokens_seen": 141066865, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.515625, "step": 6567, "time_per_iteration": 2.4575908184051514 }, { "auxiliary_loss_clip": 0.01068418, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.01729381, "balance_loss_mlp": 1.02343106, "epoch": 0.3948895235232226, "flos": 14756738492160.0, "grad_norm": 1.8477315175351015, "language_loss": 0.80487537, "learning_rate": 2.648853481143928e-06, "loss": 0.82586837, "num_input_tokens_seen": 141084210, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.44921875, "step": 6568, "time_per_iteration": 2.340587615966797 }, { "auxiliary_loss_clip": 0.01072536, "auxiliary_loss_mlp": 0.01028241, "balance_loss_clip": 1.01344717, "balance_loss_mlp": 1.02351689, "epoch": 0.39494964677589056, "flos": 22564644595200.0, "grad_norm": 1.6683448632107267, "language_loss": 0.84761685, "learning_rate": 2.648496083338311e-06, "loss": 0.86862463, "num_input_tokens_seen": 141103895, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49023438, "step": 6569, "time_per_iteration": 2.4029245376586914 }, { "auxiliary_loss_clip": 0.01073176, "auxiliary_loss_mlp": 0.01029803, "balance_loss_clip": 1.0162673, "balance_loss_mlp": 1.02544475, "epoch": 0.3950097700285585, "flos": 22962165868800.0, "grad_norm": 2.29149063381115, "language_loss": 0.74519515, "learning_rate": 2.648138662389537e-06, "loss": 0.76622492, "num_input_tokens_seen": 141124000, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.47851562, "step": 6570, "time_per_iteration": 2.4220478534698486 }, { "auxiliary_loss_clip": 0.01072221, "auxiliary_loss_mlp": 0.01026709, "balance_loss_clip": 1.01253557, "balance_loss_mlp": 1.02289999, "epoch": 0.3950698932812265, "flos": 20594181571200.0, "grad_norm": 2.5870159572169644, "language_loss": 0.79812562, "learning_rate": 2.6477812183103606e-06, "loss": 0.81911492, "num_input_tokens_seen": 141142535, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.49414062, "step": 6571, "time_per_iteration": 2.3925352096557617 }, { "auxiliary_loss_clip": 0.01071571, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.02297211, "balance_loss_mlp": 1.02379191, "epoch": 0.39513001653389446, "flos": 20374752476160.0, "grad_norm": 1.6882012571418306, "language_loss": 0.77988535, "learning_rate": 2.647423751113539e-06, "loss": 0.80097646, "num_input_tokens_seen": 141161575, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.4765625, "step": 6572, "time_per_iteration": 2.3814303874969482 }, { "auxiliary_loss_clip": 0.01073321, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.01555371, "balance_loss_mlp": 1.02300763, "epoch": 0.3951901397865625, "flos": 26462592892800.0, "grad_norm": 1.6707871094968614, "language_loss": 0.74629748, "learning_rate": 2.6470662608118294e-06, "loss": 0.7673437, "num_input_tokens_seen": 141181150, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.50390625, "step": 6573, "time_per_iteration": 3.8870034217834473 }, { "auxiliary_loss_clip": 0.01069885, "auxiliary_loss_mlp": 0.01027312, "balance_loss_clip": 1.01440132, "balance_loss_mlp": 1.02291834, "epoch": 0.39525026303923044, "flos": 43836595856640.0, "grad_norm": 1.6465136559072577, "language_loss": 0.68008959, "learning_rate": 2.64670874741799e-06, "loss": 0.70106161, "num_input_tokens_seen": 141206310, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.46875, "step": 6574, "time_per_iteration": 2.599647045135498 }, { "auxiliary_loss_clip": 0.01073978, "auxiliary_loss_mlp": 0.01027054, "balance_loss_clip": 1.01165223, "balance_loss_mlp": 1.02387023, "epoch": 0.3953103862918984, "flos": 18039831102720.0, "grad_norm": 2.3783593159200893, "language_loss": 0.71899128, "learning_rate": 2.6463512109447776e-06, "loss": 0.74000162, "num_input_tokens_seen": 141223925, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.5, "step": 6575, "time_per_iteration": 2.3828279972076416 }, { "auxiliary_loss_clip": 0.01073082, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.01983726, "balance_loss_mlp": 1.02329433, "epoch": 0.3953705095445664, "flos": 16975259792640.0, "grad_norm": 1.7801197347859303, "language_loss": 0.73074341, "learning_rate": 2.645993651404954e-06, "loss": 0.75181937, "num_input_tokens_seen": 141239010, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49804688, "step": 6576, "time_per_iteration": 2.353875160217285 }, { "auxiliary_loss_clip": 0.01071715, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.01729655, "balance_loss_mlp": 1.02338433, "epoch": 0.39543063279723434, "flos": 17410452289920.0, "grad_norm": 2.196600370720259, "language_loss": 0.83569044, "learning_rate": 2.6456360688112785e-06, "loss": 0.85671371, "num_input_tokens_seen": 141252255, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.484375, "step": 6577, "time_per_iteration": 2.3597140312194824 }, { "auxiliary_loss_clip": 0.01071332, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.0148046, "balance_loss_mlp": 1.02414203, "epoch": 0.3954907560499023, "flos": 22783096172160.0, "grad_norm": 2.4305163458615993, "language_loss": 0.89513612, "learning_rate": 2.6452784631765117e-06, "loss": 0.91614127, "num_input_tokens_seen": 141269325, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.47265625, "step": 6578, "time_per_iteration": 2.3956236839294434 }, { "auxiliary_loss_clip": 0.01074793, "auxiliary_loss_mlp": 0.01031306, "balance_loss_clip": 1.01591635, "balance_loss_mlp": 1.02413046, "epoch": 0.39555087930257027, "flos": 21943330306560.0, "grad_norm": 1.8394691599252675, "language_loss": 0.78009337, "learning_rate": 2.6449208345134174e-06, "loss": 0.80115438, "num_input_tokens_seen": 141288505, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.5078125, "step": 6579, "time_per_iteration": 3.8065104484558105 }, { "auxiliary_loss_clip": 0.01073089, "auxiliary_loss_mlp": 0.01031146, "balance_loss_clip": 1.01633966, "balance_loss_mlp": 1.02277017, "epoch": 0.39561100255523823, "flos": 20403800593920.0, "grad_norm": 1.9642206333441918, "language_loss": 0.68220782, "learning_rate": 2.6445631828347566e-06, "loss": 0.70325017, "num_input_tokens_seen": 141303680, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.50390625, "step": 6580, "time_per_iteration": 3.7811038494110107 }, { "auxiliary_loss_clip": 0.01071221, "auxiliary_loss_mlp": 0.01030542, "balance_loss_clip": 1.01605809, "balance_loss_mlp": 1.02261019, "epoch": 0.3956711258079062, "flos": 27963334218240.0, "grad_norm": 2.2522349701773847, "language_loss": 0.58720434, "learning_rate": 2.644205508153295e-06, "loss": 0.60822201, "num_input_tokens_seen": 141324090, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48632812, "step": 6581, "time_per_iteration": 2.4350225925445557 }, { "auxiliary_loss_clip": 0.01076462, "auxiliary_loss_mlp": 0.01032451, "balance_loss_clip": 1.01679897, "balance_loss_mlp": 1.02452028, "epoch": 0.39573124906057416, "flos": 14427437748480.0, "grad_norm": 1.775692491099468, "language_loss": 0.69364727, "learning_rate": 2.6438478104817953e-06, "loss": 0.7147364, "num_input_tokens_seen": 141342235, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.51953125, "step": 6582, "time_per_iteration": 2.395512580871582 }, { "auxiliary_loss_clip": 0.0107273, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.01527953, "balance_loss_mlp": 1.02370632, "epoch": 0.39579137231324213, "flos": 18732717411840.0, "grad_norm": 3.289534970259823, "language_loss": 0.75627226, "learning_rate": 2.643490089833023e-06, "loss": 0.77730334, "num_input_tokens_seen": 141361195, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.49023438, "step": 6583, "time_per_iteration": 2.370741367340088 }, { "auxiliary_loss_clip": 0.01072094, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.0186677, "balance_loss_mlp": 1.02330935, "epoch": 0.3958514955659101, "flos": 17675442576000.0, "grad_norm": 1.8348948649214971, "language_loss": 0.65808529, "learning_rate": 2.6431323462197453e-06, "loss": 0.67913473, "num_input_tokens_seen": 141378275, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.48828125, "step": 6584, "time_per_iteration": 3.7506649494171143 }, { "auxiliary_loss_clip": 0.01074744, "auxiliary_loss_mlp": 0.01038771, "balance_loss_clip": 1.02268934, "balance_loss_mlp": 1.02272248, "epoch": 0.39591161881857806, "flos": 29307979388160.0, "grad_norm": 1.9879930838699553, "language_loss": 0.72674608, "learning_rate": 2.642774579654728e-06, "loss": 0.74788117, "num_input_tokens_seen": 141396960, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.51953125, "step": 6585, "time_per_iteration": 2.459266424179077 }, { "auxiliary_loss_clip": 0.01073069, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 1.01352, "balance_loss_mlp": 1.02397621, "epoch": 0.3959717420712461, "flos": 25770753924480.0, "grad_norm": 1.7278749965790197, "language_loss": 0.73095381, "learning_rate": 2.6424167901507393e-06, "loss": 0.75196028, "num_input_tokens_seen": 141417320, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49023438, "step": 6586, "time_per_iteration": 2.4353652000427246 }, { "auxiliary_loss_clip": 0.01072521, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.01594853, "balance_loss_mlp": 1.02377987, "epoch": 0.39603186532391405, "flos": 20922714264960.0, "grad_norm": 1.6781280423681042, "language_loss": 0.71794808, "learning_rate": 2.6420589777205483e-06, "loss": 0.73897719, "num_input_tokens_seen": 141435985, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.48632812, "step": 6587, "time_per_iteration": 2.405137777328491 }, { "auxiliary_loss_clip": 0.01015345, "auxiliary_loss_mlp": 0.01003776, "balance_loss_clip": 1.00241685, "balance_loss_mlp": 1.00530243, "epoch": 0.396091988576582, "flos": 54878261086080.0, "grad_norm": 0.8987258930778897, "language_loss": 0.61268282, "learning_rate": 2.641701142376924e-06, "loss": 0.63287407, "num_input_tokens_seen": 141486075, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.10058594, "step": 6588, "time_per_iteration": 2.8463587760925293 }, { "auxiliary_loss_clip": 0.01072132, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.01319623, "balance_loss_mlp": 1.02277255, "epoch": 0.39615211182925, "flos": 20701888715520.0, "grad_norm": 1.6636573203544305, "language_loss": 0.8132534, "learning_rate": 2.6413432841326364e-06, "loss": 0.83425158, "num_input_tokens_seen": 141505280, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.4921875, "step": 6589, "time_per_iteration": 2.4195492267608643 }, { "auxiliary_loss_clip": 0.01072344, "auxiliary_loss_mlp": 0.01027265, "balance_loss_clip": 1.01224494, "balance_loss_mlp": 1.0235672, "epoch": 0.39621223508191794, "flos": 20993308767360.0, "grad_norm": 4.030522095439193, "language_loss": 0.7032398, "learning_rate": 2.6409854030004564e-06, "loss": 0.72423589, "num_input_tokens_seen": 141523930, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.48632812, "step": 6590, "time_per_iteration": 2.3825924396514893 }, { "auxiliary_loss_clip": 0.01073648, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 1.01562881, "balance_loss_mlp": 1.0243094, "epoch": 0.3962723583345859, "flos": 23367681843840.0, "grad_norm": 1.7927683030698112, "language_loss": 0.76021945, "learning_rate": 2.640627498993157e-06, "loss": 0.78125453, "num_input_tokens_seen": 141541320, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.4921875, "step": 6591, "time_per_iteration": 2.378952741622925 }, { "auxiliary_loss_clip": 0.0107514, "auxiliary_loss_mlp": 0.01039716, "balance_loss_clip": 1.0240761, "balance_loss_mlp": 1.02498293, "epoch": 0.39633248158725387, "flos": 25114526409600.0, "grad_norm": 2.0173311176285207, "language_loss": 0.78368616, "learning_rate": 2.6402695721235094e-06, "loss": 0.80483472, "num_input_tokens_seen": 141561880, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.5, "step": 6592, "time_per_iteration": 2.427351474761963 }, { "auxiliary_loss_clip": 0.01068367, "auxiliary_loss_mlp": 0.01027567, "balance_loss_clip": 1.01445425, "balance_loss_mlp": 1.02219844, "epoch": 0.39639260483992184, "flos": 39786007628160.0, "grad_norm": 2.0917031031685687, "language_loss": 0.69307292, "learning_rate": 2.6399116224042875e-06, "loss": 0.71403217, "num_input_tokens_seen": 141586460, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.4609375, "step": 6593, "time_per_iteration": 2.5655195713043213 }, { "auxiliary_loss_clip": 0.01076396, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.01721835, "balance_loss_mlp": 1.02364969, "epoch": 0.3964527280925898, "flos": 17346106920960.0, "grad_norm": 1.6529949247531577, "language_loss": 0.77677226, "learning_rate": 2.6395536498482666e-06, "loss": 0.79786396, "num_input_tokens_seen": 141605955, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.52734375, "step": 6594, "time_per_iteration": 2.421786069869995 }, { "auxiliary_loss_clip": 0.01013301, "auxiliary_loss_mlp": 0.01002876, "balance_loss_clip": 1.00164258, "balance_loss_mlp": 1.00343752, "epoch": 0.39651285134525777, "flos": 71714182176000.0, "grad_norm": 0.9507616657036345, "language_loss": 0.63048959, "learning_rate": 2.6391956544682205e-06, "loss": 0.6506514, "num_input_tokens_seen": 141673140, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.09863281, "step": 6595, "time_per_iteration": 3.070241689682007 }, { "auxiliary_loss_clip": 0.01077312, "auxiliary_loss_mlp": 0.01042543, "balance_loss_clip": 1.02587748, "balance_loss_mlp": 1.02509665, "epoch": 0.39657297459792573, "flos": 25774524351360.0, "grad_norm": 2.095917998493346, "language_loss": 0.63511324, "learning_rate": 2.6388376362769258e-06, "loss": 0.65631175, "num_input_tokens_seen": 141692955, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.5234375, "step": 6596, "time_per_iteration": 2.433924436569214 }, { "auxiliary_loss_clip": 0.01070177, "auxiliary_loss_mlp": 0.01035416, "balance_loss_clip": 1.02105105, "balance_loss_mlp": 1.0235101, "epoch": 0.3966330978505937, "flos": 20265090295680.0, "grad_norm": 2.288815085915132, "language_loss": 0.78737879, "learning_rate": 2.638479595287159e-06, "loss": 0.80843472, "num_input_tokens_seen": 141710680, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.46679688, "step": 6597, "time_per_iteration": 2.441993474960327 }, { "auxiliary_loss_clip": 0.01075419, "auxiliary_loss_mlp": 0.01031484, "balance_loss_clip": 1.01642215, "balance_loss_mlp": 1.02461541, "epoch": 0.39669322110326166, "flos": 20630142138240.0, "grad_norm": 2.048859630911311, "language_loss": 0.67515361, "learning_rate": 2.638121531511698e-06, "loss": 0.69622266, "num_input_tokens_seen": 141729860, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.5078125, "step": 6598, "time_per_iteration": 2.5750367641448975 }, { "auxiliary_loss_clip": 0.01073735, "auxiliary_loss_mlp": 0.01032783, "balance_loss_clip": 1.01866269, "balance_loss_mlp": 1.02414203, "epoch": 0.3967533443559297, "flos": 21724983463680.0, "grad_norm": 1.665831349375174, "language_loss": 0.78801513, "learning_rate": 2.637763444963321e-06, "loss": 0.80908036, "num_input_tokens_seen": 141749060, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.49609375, "step": 6599, "time_per_iteration": 2.4718222618103027 }, { "auxiliary_loss_clip": 0.01073288, "auxiliary_loss_mlp": 0.01028626, "balance_loss_clip": 1.01309323, "balance_loss_mlp": 1.02418017, "epoch": 0.39681346760859765, "flos": 25482964654080.0, "grad_norm": 2.4949119101660076, "language_loss": 0.72558117, "learning_rate": 2.637405335654807e-06, "loss": 0.74660027, "num_input_tokens_seen": 141769860, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.4921875, "step": 6600, "time_per_iteration": 2.468285083770752 }, { "auxiliary_loss_clip": 0.01072041, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.01596189, "balance_loss_mlp": 1.02299666, "epoch": 0.3968735908612656, "flos": 20958535186560.0, "grad_norm": 2.1987392358751072, "language_loss": 0.84941244, "learning_rate": 2.6370472035989367e-06, "loss": 0.87043887, "num_input_tokens_seen": 141788465, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49023438, "step": 6601, "time_per_iteration": 2.467297077178955 }, { "auxiliary_loss_clip": 0.01079397, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.02028847, "balance_loss_mlp": 1.02517986, "epoch": 0.3969337141139336, "flos": 10706324820480.0, "grad_norm": 12.416557919366461, "language_loss": 0.70184731, "learning_rate": 2.6366890488084897e-06, "loss": 0.72301674, "num_input_tokens_seen": 141804955, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.54296875, "step": 6602, "time_per_iteration": 2.425172805786133 }, { "auxiliary_loss_clip": 0.01073373, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.02181971, "balance_loss_mlp": 1.02396977, "epoch": 0.39699383736660154, "flos": 17593013122560.0, "grad_norm": 2.2917741666065705, "language_loss": 0.83603591, "learning_rate": 2.636330871296249e-06, "loss": 0.85713363, "num_input_tokens_seen": 141820025, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49414062, "step": 6603, "time_per_iteration": 2.417780637741089 }, { "auxiliary_loss_clip": 0.01071579, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.01385045, "balance_loss_mlp": 1.02388239, "epoch": 0.3970539606192695, "flos": 17784965111040.0, "grad_norm": 1.535697167451568, "language_loss": 0.7330876, "learning_rate": 2.635972671074996e-06, "loss": 0.75408542, "num_input_tokens_seen": 141838735, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.4765625, "step": 6604, "time_per_iteration": 2.4277050495147705 }, { "auxiliary_loss_clip": 0.01069355, "auxiliary_loss_mlp": 0.01028661, "balance_loss_clip": 1.01400399, "balance_loss_mlp": 1.02336073, "epoch": 0.3971140838719375, "flos": 24788367688320.0, "grad_norm": 2.023280842428495, "language_loss": 0.82218075, "learning_rate": 2.6356144481575144e-06, "loss": 0.84316093, "num_input_tokens_seen": 141858090, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.4609375, "step": 6605, "time_per_iteration": 2.429494857788086 }, { "auxiliary_loss_clip": 0.010693, "auxiliary_loss_mlp": 0.01025142, "balance_loss_clip": 1.01207614, "balance_loss_mlp": 1.02218819, "epoch": 0.39717420712460544, "flos": 24242430758400.0, "grad_norm": 1.5420859583299282, "language_loss": 0.73923743, "learning_rate": 2.6352562025565885e-06, "loss": 0.7601819, "num_input_tokens_seen": 141877540, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.47070312, "step": 6606, "time_per_iteration": 2.4811911582946777 }, { "auxiliary_loss_clip": 0.0107577, "auxiliary_loss_mlp": 0.01030118, "balance_loss_clip": 1.01478803, "balance_loss_mlp": 1.02628827, "epoch": 0.3972343303772734, "flos": 25883523216000.0, "grad_norm": 2.0636722669012992, "language_loss": 0.73940217, "learning_rate": 2.634897934285002e-06, "loss": 0.76046103, "num_input_tokens_seen": 141897315, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.49414062, "step": 6607, "time_per_iteration": 2.4875147342681885 }, { "auxiliary_loss_clip": 0.01074223, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.01935089, "balance_loss_mlp": 1.02472949, "epoch": 0.39729445362994137, "flos": 45621984430080.0, "grad_norm": 2.1273720981107083, "language_loss": 0.67869693, "learning_rate": 2.6345396433555415e-06, "loss": 0.69977665, "num_input_tokens_seen": 141919580, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.49414062, "step": 6608, "time_per_iteration": 2.6384451389312744 }, { "auxiliary_loss_clip": 0.0107368, "auxiliary_loss_mlp": 0.01031088, "balance_loss_clip": 1.01509643, "balance_loss_mlp": 1.0232302, "epoch": 0.39735457688260933, "flos": 20192924782080.0, "grad_norm": 1.9367330876541697, "language_loss": 0.74092335, "learning_rate": 2.6341813297809937e-06, "loss": 0.76197106, "num_input_tokens_seen": 141937045, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.50390625, "step": 6609, "time_per_iteration": 2.423671245574951 }, { "auxiliary_loss_clip": 0.01074604, "auxiliary_loss_mlp": 0.01027454, "balance_loss_clip": 1.0122726, "balance_loss_mlp": 1.02458334, "epoch": 0.3974147001352773, "flos": 23330045531520.0, "grad_norm": 1.9670779569865173, "language_loss": 0.71762049, "learning_rate": 2.633822993574145e-06, "loss": 0.73864102, "num_input_tokens_seen": 141956695, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.5, "step": 6610, "time_per_iteration": 2.447556972503662 }, { "auxiliary_loss_clip": 0.0106819, "auxiliary_loss_mlp": 0.01029794, "balance_loss_clip": 1.01582897, "balance_loss_mlp": 1.02203941, "epoch": 0.39747482338794526, "flos": 21687591530880.0, "grad_norm": 1.4957365240001774, "language_loss": 0.78553832, "learning_rate": 2.633464634747785e-06, "loss": 0.8065182, "num_input_tokens_seen": 141975935, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.4609375, "step": 6611, "time_per_iteration": 2.4243054389953613 }, { "auxiliary_loss_clip": 0.01074559, "auxiliary_loss_mlp": 0.01030905, "balance_loss_clip": 1.01669502, "balance_loss_mlp": 1.02427948, "epoch": 0.3975349466406133, "flos": 30987511119360.0, "grad_norm": 2.294494954631813, "language_loss": 0.79610085, "learning_rate": 2.6331062533147002e-06, "loss": 0.81715554, "num_input_tokens_seen": 141995750, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.50390625, "step": 6612, "time_per_iteration": 2.50923752784729 }, { "auxiliary_loss_clip": 0.01012531, "auxiliary_loss_mlp": 0.00999866, "balance_loss_clip": 0.99842912, "balance_loss_mlp": 1.00267804, "epoch": 0.39759506989328125, "flos": 63680702578560.0, "grad_norm": 0.8377082850529332, "language_loss": 0.6494652, "learning_rate": 2.632747849287683e-06, "loss": 0.66958922, "num_input_tokens_seen": 142057655, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.09863281, "step": 6613, "time_per_iteration": 4.366408109664917 }, { "auxiliary_loss_clip": 0.01072996, "auxiliary_loss_mlp": 0.01032572, "balance_loss_clip": 1.01753378, "balance_loss_mlp": 1.02373576, "epoch": 0.3976551931459492, "flos": 23694713349120.0, "grad_norm": 2.3277149819818406, "language_loss": 0.71182394, "learning_rate": 2.632389422679523e-06, "loss": 0.73287964, "num_input_tokens_seen": 142076020, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.4921875, "step": 6614, "time_per_iteration": 2.449972152709961 }, { "auxiliary_loss_clip": 0.01074282, "auxiliary_loss_mlp": 0.01028998, "balance_loss_clip": 1.01391792, "balance_loss_mlp": 1.02487075, "epoch": 0.3977153163986172, "flos": 15668739694080.0, "grad_norm": 2.2532170211440516, "language_loss": 0.81478083, "learning_rate": 2.63203097350301e-06, "loss": 0.83581364, "num_input_tokens_seen": 142093790, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49414062, "step": 6615, "time_per_iteration": 2.4869959354400635 }, { "auxiliary_loss_clip": 0.01070477, "auxiliary_loss_mlp": 0.01027059, "balance_loss_clip": 1.01311135, "balance_loss_mlp": 1.02264977, "epoch": 0.39777543965128515, "flos": 14063817271680.0, "grad_norm": 1.7476188459762432, "language_loss": 0.66889179, "learning_rate": 2.631672501770938e-06, "loss": 0.68986714, "num_input_tokens_seen": 142110545, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.47851562, "step": 6616, "time_per_iteration": 2.449387311935425 }, { "auxiliary_loss_clip": 0.01074761, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.01970732, "balance_loss_mlp": 1.02320814, "epoch": 0.3978355629039531, "flos": 23366355212160.0, "grad_norm": 2.75083159245793, "language_loss": 0.8351993, "learning_rate": 2.631314007496099e-06, "loss": 0.85629982, "num_input_tokens_seen": 142128695, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.515625, "step": 6617, "time_per_iteration": 2.4497227668762207 }, { "auxiliary_loss_clip": 0.01069706, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 1.01942551, "balance_loss_mlp": 1.02357531, "epoch": 0.3978956861566211, "flos": 19061773776000.0, "grad_norm": 1.4966178189274915, "language_loss": 0.71953106, "learning_rate": 2.6309554906912873e-06, "loss": 0.74054861, "num_input_tokens_seen": 142148375, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.4609375, "step": 6618, "time_per_iteration": 3.8436081409454346 }, { "auxiliary_loss_clip": 0.01073663, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.01286101, "balance_loss_mlp": 1.02433228, "epoch": 0.39795580940928904, "flos": 30226369368960.0, "grad_norm": 1.8691823070633389, "language_loss": 0.65258539, "learning_rate": 2.6305969513692965e-06, "loss": 0.67360455, "num_input_tokens_seen": 142169735, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.4921875, "step": 6619, "time_per_iteration": 2.5060930252075195 }, { "auxiliary_loss_clip": 0.01012081, "auxiliary_loss_mlp": 0.01003052, "balance_loss_clip": 1.00174701, "balance_loss_mlp": 1.00220847, "epoch": 0.398015932661957, "flos": 69843885442560.0, "grad_norm": 0.8225231916317525, "language_loss": 0.58208799, "learning_rate": 2.630238389542924e-06, "loss": 0.60223931, "num_input_tokens_seen": 142229520, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.09863281, "step": 6620, "time_per_iteration": 4.43730354309082 }, { "auxiliary_loss_clip": 0.01072964, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.01952314, "balance_loss_mlp": 1.02452278, "epoch": 0.39807605591462497, "flos": 20156719835520.0, "grad_norm": 1.6378975711741601, "language_loss": 0.78978765, "learning_rate": 2.629879805224964e-06, "loss": 0.81084305, "num_input_tokens_seen": 142247660, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.484375, "step": 6621, "time_per_iteration": 2.4630956649780273 }, { "auxiliary_loss_clip": 0.01071045, "auxiliary_loss_mlp": 0.01026801, "balance_loss_clip": 1.01306832, "balance_loss_mlp": 1.0235759, "epoch": 0.39813617916729294, "flos": 21140711994240.0, "grad_norm": 2.657284676416563, "language_loss": 0.78505963, "learning_rate": 2.629521198428213e-06, "loss": 0.80603814, "num_input_tokens_seen": 142266990, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.47460938, "step": 6622, "time_per_iteration": 2.426262855529785 }, { "auxiliary_loss_clip": 0.01071193, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.01594281, "balance_loss_mlp": 1.0229032, "epoch": 0.3981963024199609, "flos": 18987513580800.0, "grad_norm": 1.6999053600728438, "language_loss": 0.75029296, "learning_rate": 2.6291625691654702e-06, "loss": 0.77131546, "num_input_tokens_seen": 142287170, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.484375, "step": 6623, "time_per_iteration": 3.8371999263763428 }, { "auxiliary_loss_clip": 0.01071046, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.01858592, "balance_loss_mlp": 1.02266097, "epoch": 0.39825642567262887, "flos": 16574352117120.0, "grad_norm": 1.7613877439509331, "language_loss": 0.79156911, "learning_rate": 2.6288039174495334e-06, "loss": 0.81261551, "num_input_tokens_seen": 142305405, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.484375, "step": 6624, "time_per_iteration": 2.4280402660369873 }, { "auxiliary_loss_clip": 0.01075203, "auxiliary_loss_mlp": 0.01040329, "balance_loss_clip": 1.02346134, "balance_loss_mlp": 1.02439928, "epoch": 0.39831654892529683, "flos": 22198754880000.0, "grad_norm": 1.8766172744924776, "language_loss": 0.83392131, "learning_rate": 2.6284452432932034e-06, "loss": 0.85507667, "num_input_tokens_seen": 142322710, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.5078125, "step": 6625, "time_per_iteration": 2.477288007736206 }, { "auxiliary_loss_clip": 0.01069912, "auxiliary_loss_mlp": 0.01030699, "balance_loss_clip": 1.01625037, "balance_loss_mlp": 1.02270007, "epoch": 0.39837667217796485, "flos": 10487209927680.0, "grad_norm": 2.031089026264865, "language_loss": 0.86226273, "learning_rate": 2.6280865467092787e-06, "loss": 0.88326883, "num_input_tokens_seen": 142338535, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.47265625, "step": 6626, "time_per_iteration": 2.441574811935425 }, { "auxiliary_loss_clip": 0.01073493, "auxiliary_loss_mlp": 0.01029173, "balance_loss_clip": 1.0135802, "balance_loss_mlp": 1.02473068, "epoch": 0.3984367954306328, "flos": 17964383921280.0, "grad_norm": 2.4560288013611933, "language_loss": 0.83296955, "learning_rate": 2.6277278277105604e-06, "loss": 0.85399616, "num_input_tokens_seen": 142354570, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.48828125, "step": 6627, "time_per_iteration": 2.47760272026062 }, { "auxiliary_loss_clip": 0.01069432, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.0210712, "balance_loss_mlp": 1.02272487, "epoch": 0.3984969186833008, "flos": 22709953140480.0, "grad_norm": 1.539132968554749, "language_loss": 0.82766044, "learning_rate": 2.627369086309851e-06, "loss": 0.84870476, "num_input_tokens_seen": 142374395, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.46679688, "step": 6628, "time_per_iteration": 2.5086874961853027 }, { "auxiliary_loss_clip": 0.01073103, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.02126884, "balance_loss_mlp": 1.02380443, "epoch": 0.39855704193596875, "flos": 23404619928960.0, "grad_norm": 1.6556971975652655, "language_loss": 0.71342582, "learning_rate": 2.6270103225199524e-06, "loss": 0.73451662, "num_input_tokens_seen": 142396040, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.4921875, "step": 6629, "time_per_iteration": 2.491774082183838 }, { "auxiliary_loss_clip": 0.010727, "auxiliary_loss_mlp": 0.01032295, "balance_loss_clip": 1.01768541, "balance_loss_mlp": 1.02521884, "epoch": 0.3986171651886367, "flos": 21250862933760.0, "grad_norm": 1.7014086988520736, "language_loss": 0.80730289, "learning_rate": 2.626651536353668e-06, "loss": 0.82835281, "num_input_tokens_seen": 142415495, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.47460938, "step": 6630, "time_per_iteration": 2.4837019443511963 }, { "auxiliary_loss_clip": 0.01074594, "auxiliary_loss_mlp": 0.01022779, "balance_loss_clip": 1.00870633, "balance_loss_mlp": 1.02513838, "epoch": 0.3986772884413047, "flos": 12457882419840.0, "grad_norm": 1.8134919079533278, "language_loss": 0.74934614, "learning_rate": 2.6262927278238032e-06, "loss": 0.77031994, "num_input_tokens_seen": 142431865, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.49414062, "step": 6631, "time_per_iteration": 2.435589551925659 }, { "auxiliary_loss_clip": 0.0107291, "auxiliary_loss_mlp": 0.01031434, "balance_loss_clip": 1.01608586, "balance_loss_mlp": 1.0240016, "epoch": 0.39873741169397264, "flos": 19645102638720.0, "grad_norm": 2.0932890864924847, "language_loss": 0.7138592, "learning_rate": 2.6259338969431613e-06, "loss": 0.73490262, "num_input_tokens_seen": 142450595, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.48828125, "step": 6632, "time_per_iteration": 2.533177137374878 }, { "auxiliary_loss_clip": 0.01068979, "auxiliary_loss_mlp": 0.01030101, "balance_loss_clip": 1.01622534, "balance_loss_mlp": 1.0216732, "epoch": 0.3987975349466406, "flos": 21683821104000.0, "grad_norm": 1.7749245307922714, "language_loss": 0.75056088, "learning_rate": 2.6255750437245487e-06, "loss": 0.77155167, "num_input_tokens_seen": 142466650, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.47460938, "step": 6633, "time_per_iteration": 2.5153300762176514 }, { "auxiliary_loss_clip": 0.01070432, "auxiliary_loss_mlp": 0.01024752, "balance_loss_clip": 1.01084614, "balance_loss_mlp": 1.02195311, "epoch": 0.3988576581993086, "flos": 23912955457920.0, "grad_norm": 1.7628208212191583, "language_loss": 0.81495905, "learning_rate": 2.625216168180772e-06, "loss": 0.83591092, "num_input_tokens_seen": 142486165, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.484375, "step": 6634, "time_per_iteration": 2.4775242805480957 }, { "auxiliary_loss_clip": 0.01072265, "auxiliary_loss_mlp": 0.01030621, "balance_loss_clip": 1.01656055, "balance_loss_mlp": 1.02456665, "epoch": 0.39891778145197654, "flos": 18148934701440.0, "grad_norm": 1.671007261610527, "language_loss": 0.74796832, "learning_rate": 2.624857270324639e-06, "loss": 0.76899719, "num_input_tokens_seen": 142505035, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.4765625, "step": 6635, "time_per_iteration": 2.4950473308563232 }, { "auxiliary_loss_clip": 0.01076285, "auxiliary_loss_mlp": 0.01035496, "balance_loss_clip": 1.0191946, "balance_loss_mlp": 1.02347112, "epoch": 0.3989779047046445, "flos": 22594356028800.0, "grad_norm": 2.9618247065671524, "language_loss": 0.66572481, "learning_rate": 2.6244983501689574e-06, "loss": 0.68684262, "num_input_tokens_seen": 142521870, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.52734375, "step": 6636, "time_per_iteration": 2.53560209274292 }, { "auxiliary_loss_clip": 0.01071293, "auxiliary_loss_mlp": 0.01029443, "balance_loss_clip": 1.01565683, "balance_loss_mlp": 1.02332687, "epoch": 0.39903802795731247, "flos": 18076245517440.0, "grad_norm": 2.7573624566813977, "language_loss": 0.81554866, "learning_rate": 2.6241394077265352e-06, "loss": 0.83655596, "num_input_tokens_seen": 142540455, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.48046875, "step": 6637, "time_per_iteration": 2.467667579650879 }, { "auxiliary_loss_clip": 0.01011698, "auxiliary_loss_mlp": 0.01002102, "balance_loss_clip": 1.00095117, "balance_loss_mlp": 1.00210905, "epoch": 0.39909815120998043, "flos": 70437722624640.0, "grad_norm": 0.7138867261636817, "language_loss": 0.53198743, "learning_rate": 2.6237804430101835e-06, "loss": 0.55212545, "num_input_tokens_seen": 142599665, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.09570312, "step": 6638, "time_per_iteration": 3.1159331798553467 }, { "auxiliary_loss_clip": 0.01072711, "auxiliary_loss_mlp": 0.0102694, "balance_loss_clip": 1.01229489, "balance_loss_mlp": 1.02373385, "epoch": 0.39915827446264845, "flos": 18548341188480.0, "grad_norm": 1.6126167806358793, "language_loss": 0.75386608, "learning_rate": 2.623421456032712e-06, "loss": 0.77486265, "num_input_tokens_seen": 142618845, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49023438, "step": 6639, "time_per_iteration": 2.4533066749572754 }, { "auxiliary_loss_clip": 0.01069403, "auxiliary_loss_mlp": 0.01030946, "balance_loss_clip": 1.01736772, "balance_loss_mlp": 1.02293527, "epoch": 0.3992183977153164, "flos": 29895986373120.0, "grad_norm": 5.721938861641512, "language_loss": 0.76002169, "learning_rate": 2.6230624468069326e-06, "loss": 0.78102523, "num_input_tokens_seen": 142640885, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.46484375, "step": 6640, "time_per_iteration": 2.5051097869873047 }, { "auxiliary_loss_clip": 0.01073513, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.0161314, "balance_loss_mlp": 1.02374053, "epoch": 0.3992785209679844, "flos": 22563981279360.0, "grad_norm": 2.46462815576201, "language_loss": 0.8213411, "learning_rate": 2.6227034153456573e-06, "loss": 0.84238803, "num_input_tokens_seen": 142659340, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49804688, "step": 6641, "time_per_iteration": 2.5071916580200195 }, { "auxiliary_loss_clip": 0.01073203, "auxiliary_loss_mlp": 0.01031901, "balance_loss_clip": 1.01683319, "balance_loss_mlp": 1.02350163, "epoch": 0.39933864422065235, "flos": 19681656698880.0, "grad_norm": 2.5993960764360624, "language_loss": 0.76370227, "learning_rate": 2.6223443616616985e-06, "loss": 0.78475332, "num_input_tokens_seen": 142677085, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.49804688, "step": 6642, "time_per_iteration": 2.466419219970703 }, { "auxiliary_loss_clip": 0.0107244, "auxiliary_loss_mlp": 0.01031026, "balance_loss_clip": 1.01634574, "balance_loss_mlp": 1.02371955, "epoch": 0.3993987674733203, "flos": 23037403582080.0, "grad_norm": 1.8469196847473455, "language_loss": 0.72247189, "learning_rate": 2.621985285767871e-06, "loss": 0.74350661, "num_input_tokens_seen": 142694595, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.48828125, "step": 6643, "time_per_iteration": 2.4636638164520264 }, { "auxiliary_loss_clip": 0.01072605, "auxiliary_loss_mlp": 0.01028365, "balance_loss_clip": 1.01352334, "balance_loss_mlp": 1.02277112, "epoch": 0.3994588907259883, "flos": 19389817710720.0, "grad_norm": 1.7385154055731078, "language_loss": 0.66287923, "learning_rate": 2.621626187676988e-06, "loss": 0.68388891, "num_input_tokens_seen": 142714175, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49804688, "step": 6644, "time_per_iteration": 2.4885880947113037 }, { "auxiliary_loss_clip": 0.01071197, "auxiliary_loss_mlp": 0.01033215, "balance_loss_clip": 1.01815891, "balance_loss_mlp": 1.02299833, "epoch": 0.39951901397865625, "flos": 13733573921280.0, "grad_norm": 2.2930590810623346, "language_loss": 0.78312695, "learning_rate": 2.6212670674018657e-06, "loss": 0.80417103, "num_input_tokens_seen": 142730955, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.48242188, "step": 6645, "time_per_iteration": 2.4476451873779297 }, { "auxiliary_loss_clip": 0.01072112, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.02000928, "balance_loss_mlp": 1.02392793, "epoch": 0.3995791372313242, "flos": 23585330459520.0, "grad_norm": 3.9877109267143065, "language_loss": 0.70151591, "learning_rate": 2.6209079249553195e-06, "loss": 0.72258914, "num_input_tokens_seen": 142751200, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.48046875, "step": 6646, "time_per_iteration": 2.4883899688720703 }, { "auxiliary_loss_clip": 0.01070617, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.01739049, "balance_loss_mlp": 1.02275634, "epoch": 0.3996392604839922, "flos": 21354974208000.0, "grad_norm": 2.344451303145302, "language_loss": 0.71729481, "learning_rate": 2.6205487603501672e-06, "loss": 0.73832202, "num_input_tokens_seen": 142770170, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.47851562, "step": 6647, "time_per_iteration": 2.4864189624786377 }, { "auxiliary_loss_clip": 0.01068436, "auxiliary_loss_mlp": 0.01030012, "balance_loss_clip": 1.01629114, "balance_loss_mlp": 1.0223968, "epoch": 0.39969938373666014, "flos": 26031031176960.0, "grad_norm": 1.5577673638209417, "language_loss": 0.74076974, "learning_rate": 2.6201895735992255e-06, "loss": 0.76175427, "num_input_tokens_seen": 142792680, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.4609375, "step": 6648, "time_per_iteration": 2.5327861309051514 }, { "auxiliary_loss_clip": 0.01073442, "auxiliary_loss_mlp": 0.01028694, "balance_loss_clip": 1.01361442, "balance_loss_mlp": 1.02304292, "epoch": 0.3997595069893281, "flos": 20115452741760.0, "grad_norm": 3.683464481059715, "language_loss": 0.66093767, "learning_rate": 2.6198303647153133e-06, "loss": 0.68195903, "num_input_tokens_seen": 142810510, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.50390625, "step": 6649, "time_per_iteration": 2.4660842418670654 }, { "auxiliary_loss_clip": 0.01075217, "auxiliary_loss_mlp": 0.01032727, "balance_loss_clip": 1.01812959, "balance_loss_mlp": 1.02606964, "epoch": 0.39981963024199607, "flos": 27782134928640.0, "grad_norm": 1.503133258693868, "language_loss": 0.75100648, "learning_rate": 2.61947113371125e-06, "loss": 0.77208591, "num_input_tokens_seen": 142832455, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.49023438, "step": 6650, "time_per_iteration": 2.4915287494659424 }, { "auxiliary_loss_clip": 0.01072812, "auxiliary_loss_mlp": 0.01032197, "balance_loss_clip": 1.01808834, "balance_loss_mlp": 1.02349114, "epoch": 0.39987975349466404, "flos": 21943365217920.0, "grad_norm": 1.5056294246379212, "language_loss": 0.72204274, "learning_rate": 2.6191118805998547e-06, "loss": 0.74309278, "num_input_tokens_seen": 142852590, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.4921875, "step": 6651, "time_per_iteration": 2.467857837677002 }, { "auxiliary_loss_clip": 0.01072487, "auxiliary_loss_mlp": 0.01031596, "balance_loss_clip": 1.01676691, "balance_loss_mlp": 1.02346241, "epoch": 0.39993987674733206, "flos": 20703354992640.0, "grad_norm": 1.9292758740306415, "language_loss": 0.72752506, "learning_rate": 2.6187526053939497e-06, "loss": 0.74856591, "num_input_tokens_seen": 142870595, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.49023438, "step": 6652, "time_per_iteration": 2.442746877670288 }, { "auxiliary_loss_clip": 0.01011719, "auxiliary_loss_mlp": 0.01000631, "balance_loss_clip": 0.99931401, "balance_loss_mlp": 1.00189078, "epoch": 0.4, "flos": 61522407106560.0, "grad_norm": 0.8483957684346403, "language_loss": 0.60638869, "learning_rate": 2.6183933081063556e-06, "loss": 0.62651217, "num_input_tokens_seen": 142925805, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.09863281, "step": 6653, "time_per_iteration": 4.38383674621582 }, { "auxiliary_loss_clip": 0.010707, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.0166986, "balance_loss_mlp": 1.02513754, "epoch": 0.400060123252668, "flos": 14501418652800.0, "grad_norm": 2.0324203839737622, "language_loss": 0.67108071, "learning_rate": 2.6180339887498946e-06, "loss": 0.69209385, "num_input_tokens_seen": 142943145, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.45507812, "step": 6654, "time_per_iteration": 2.475163698196411 }, { "auxiliary_loss_clip": 0.0107101, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.01865411, "balance_loss_mlp": 1.02303898, "epoch": 0.40012024650533595, "flos": 19092462727680.0, "grad_norm": 2.1088906145080926, "language_loss": 0.89763999, "learning_rate": 2.617674647337391e-06, "loss": 0.9186728, "num_input_tokens_seen": 142956925, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.48046875, "step": 6655, "time_per_iteration": 2.4557688236236572 }, { "auxiliary_loss_clip": 0.01069039, "auxiliary_loss_mlp": 0.01026128, "balance_loss_clip": 1.01370692, "balance_loss_mlp": 1.02378738, "epoch": 0.4001803697580039, "flos": 29349735240960.0, "grad_norm": 1.5996997307484297, "language_loss": 0.73221993, "learning_rate": 2.6173152838816673e-06, "loss": 0.75317162, "num_input_tokens_seen": 142978040, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.453125, "step": 6656, "time_per_iteration": 3.932459831237793 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 142978040, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.575514046259528e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }